Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-27 17:11:59 +00:00)
The `load_tiktoken_bpe()` function depends on blobfile to load tokenizer.model files. However, blobfile pulls in pycryptodomex, which it uses primarily for JWT signing in GCP, functionality we don't need since we always load tokenizers from local files. pycryptodomex implements its own cryptographic primitives, which are known to be problematic and insecure. While blobfile could in principle switch to the better-audited PyCA cryptography library, the project appears inactive, so that transition is unlikely to happen soon.

Fortunately, `load_tiktoken_bpe()` is a simple function: it reads a BPE file and returns a dictionary mapping byte sequences to their mergeable ranks. It is straightforward enough to implement ourselves.

Signed-off-by: Sébastien Han <seb@redhat.com>
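For context, a tiktoken-style BPE file is plain text with one token per line: the base64-encoded token bytes, a space, and the integer rank. A minimal illustration of the format the parser below expects (toy data, not a real tokenizer.model):

import base64

# Two toy lines in the tiktoken BPE format: "<base64 token> <rank>"
sample = "SGVsbG8= 0\nIHdvcmxk 1"
for line in sample.splitlines():
    token, rank = line.split()
    print(base64.b64decode(token), int(rank))
# -> b'Hello' 0
# -> b' world' 1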
40 lines
1.1 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import base64
from pathlib import Path

from llama_stack.log import get_logger

logger = get_logger(__name__, "tokenizer_utils")


def load_bpe_file(model_path: Path) -> dict[bytes, int]:
    """
    Load a BPE file directly and return mergeable ranks.

    Args:
        model_path (Path): Path to the BPE model file.

    Returns:
        dict[bytes, int]: Dictionary mapping byte sequences to their ranks.
    """
    mergeable_ranks = {}

    with open(model_path, encoding="utf-8") as f:
        content = f.read()

    for line in content.splitlines():
        if not line.strip():  # Skip empty lines
            continue
        try:
            # Each line is "<base64-encoded token> <integer rank>".
            token, rank = line.split()
            mergeable_ranks[base64.b64decode(token)] = int(rank)
        except Exception as e:
            logger.warning(f"Failed to parse line '{line}': {e}")
            continue

    return mergeable_ranks
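A minimal sketch of how the returned ranks could replace `load_tiktoken_bpe()` when constructing a tiktoken Encoding. The module path, file path, and encoding name are illustrative assumptions; the split regex follows the pattern published in Meta's Llama 3 reference tokenizer and is shown here only as an example.

from pathlib import Path

import tiktoken

from llama_stack.models.llama.tokenizer_utils import load_bpe_file  # assumed module path

# Hypothetical local tokenizer.model; adjust to your checkout.
ranks = load_bpe_file(Path("~/.llama/checkpoints/Llama3.1-8B/tokenizer.model").expanduser())

enc = tiktoken.Encoding(
    name="llama_local",  # illustrative name
    # Llama 3-style pre-tokenization regex (illustrative, not defined in this module).
    pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
    mergeable_ranks=ranks,
    special_tokens={},  # real tokenizers also register <|begin_of_text|> etc. here
)

print(enc.encode("hello world"))

Because tiktoken only needs the mergeable_ranks dictionary, nothing else from blobfile's loading path is required, which is what makes the dependency removable.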