# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import base64
from pathlib import Path

from llama_stack.log import get_logger

logger = get_logger(__name__, "tokenizer_utils")


def load_bpe_file(model_path: Path) -> dict[bytes, int]:
    """
    Load BPE file directly and return mergeable ranks.

    Args:
        model_path (Path): Path to the BPE model file.

    Returns:
        dict[bytes, int]: Dictionary mapping byte sequences to their ranks.
    """
    mergeable_ranks = {}

    with open(model_path, encoding="utf-8") as f:
        content = f.read()

    for line in content.splitlines():
        if not line.strip():  # Skip empty lines
            continue
        try:
            # Each non-empty line is "<base64-encoded token> <integer rank>".
            token, rank = line.split()
            mergeable_ranks[base64.b64decode(token)] = int(rank)
        except Exception as e:
            logger.warning(f"Failed to parse line '{line}': {e}")
            continue

    return mergeable_ranks
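

# Minimal usage sketch (not part of the original module): round-trips a tiny
# BPE file through load_bpe_file to illustrate the expected
# "<base64 token> <rank>" line format. In llama-stack, the returned ranks are
# typically handed to tiktoken.Encoding(..., mergeable_ranks=ranks); the file
# name and contents below are hypothetical.
if __name__ == "__main__":
    import tempfile

    # Write two merge entries: a base64-encoded byte sequence, a space,
    # and an integer rank, one pair per line.
    with tempfile.NamedTemporaryFile("w", suffix=".model", delete=False) as tmp:
        tmp.write(base64.b64encode(b"hello").decode() + " 0\n")
        tmp.write(base64.b64encode(b"world").decode() + " 1\n")
        demo_path = Path(tmp.name)

    ranks = load_bpe_file(demo_path)
    assert ranks == {b"hello": 0, b"world": 1}
    print(f"loaded {len(ranks)} merge ranks from {demo_path}")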