Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)
chore: remove usage of load_tiktoken_bpe (#2276)
This commit is contained in: parent af65207ebd, commit 1c0c6e1e17
6 changed files with 234 additions and 17 deletions
llama_stack/models/llama/tokenizer_utils.py (new file, +40 lines)
@@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import base64
from pathlib import Path

from llama_stack.log import get_logger

logger = get_logger(__name__, "tokenizer_utils")


def load_bpe_file(model_path: Path) -> dict[bytes, int]:
    """
    Load BPE file directly and return mergeable ranks.

    Args:
        model_path (Path): Path to the BPE model file.

    Returns:
        dict[bytes, int]: Dictionary mapping byte sequences to their ranks.
    """
    mergeable_ranks = {}

    with open(model_path, encoding="utf-8") as f:
        content = f.read()

    for line in content.splitlines():
        if not line.strip():  # Skip empty lines
            continue
        try:
            token, rank = line.split()
            mergeable_ranks[base64.b64decode(token)] = int(rank)
        except Exception as e:
            logger.warning(f"Failed to parse line '{line}': {e}")
            continue

    return mergeable_ranks
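
For context on the format this loader handles: tiktoken-style BPE files store one base64-encoded token and its integer rank per line, which is the same format tiktoken's load_tiktoken_bpe parses. Below is a minimal sketch of round-tripping that format through the new loader; the file path and token bytes are made-up examples, not part of this commit.

    # Sketch: write a tiny tiktoken-style BPE file and load it back.
    # The path and token bytes below are hypothetical examples.
    import base64
    from pathlib import Path

    from llama_stack.models.llama.tokenizer_utils import load_bpe_file

    path = Path("/tmp/example.tiktoken")
    path.write_text(
        "\n".join(
            [
                f"{base64.b64encode(b'hello').decode()} 0",
                f"{base64.b64encode(b' world').decode()} 1",
            ]
        ),
        encoding="utf-8",
    )

    ranks = load_bpe_file(path)
    assert ranks == {b"hello": 0, b" world": 1}

The returned dict[bytes, int] is the same shape that load_tiktoken_bpe produced, so it can be passed as the mergeable_ranks argument when constructing a tiktoken.Encoding, removing the dependency on tiktoken's loader (and its network fetch path) for reading local model files.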