fix(utils.py): default claude-3 to tiktoken (0.8s faster than hf tokenizer)
This commit is contained in:
  parent 46a81524ab
  commit a1dd341ca1
2 changed files with 57 additions and 21 deletions
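In short: `_select_tokenizer` now skips the Hugging Face Anthropic tokenizer for claude-3 models so they fall through to the default tiktoken (cl100k_base) path, the Anthropic and Cohere tokenizers are loaded once at import time instead of on every call, and the selector is wrapped in `lru_cache`. Below is a minimal, self-contained sketch of that selection behavior; the names `select_tokenizer` and `openai_encoding` and the string-based model checks are simplifications for illustration, not litellm's actual code, and the real function also handles llama-2/replicate models and custom tokenizers.

import json
from functools import lru_cache
from importlib import resources

import tiktoken
from tokenizers import Tokenizer

# Load the heavyweight tokenizers once at import time (mirrors the utils.py hunk below).
with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
    claude_tokenizer = Tokenizer.from_str(json.dumps(json.load(f)))
cohere_tokenizer = Tokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer")
openai_encoding = tiktoken.get_encoding("cl100k_base")


@lru_cache(maxsize=128)
def select_tokenizer(model: str):
    # Only command-r models keep the dedicated Cohere tokenizer.
    if "command-r" in model:
        return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer}
    # claude-3 deliberately does NOT match this branch, so it falls through to tiktoken.
    if model.startswith("claude") and "claude-3" not in model:
        return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer}
    # Default: cl100k_base via tiktoken (the faster path this commit picks for claude-3).
    return {"type": "openai_tokenizer", "tokenizer": openai_encoding}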
litellm/tests/test_token_counter.py

@@ -10,6 +10,7 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import time
 from litellm import token_counter, create_pretrained_tokenizer, encode, decode
+from litellm.tests.large_text import text


 def test_token_counter_normal_plus_function_calling():
@@ -70,10 +71,14 @@ def test_tokenizers():
         )

         # llama3 tokenizer (also testing custom tokenizer)
-        llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text)
+        llama3_tokens_1 = token_counter(
+            model="meta-llama/llama-3-70b-instruct", text=sample_text
+        )

         llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
-        llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text)
+        llama3_tokens_2 = token_counter(
+            custom_tokenizer=llama3_tokenizer, text=sample_text
+        )

         print(
             f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}"
@@ -84,7 +89,9 @@ def test_tokenizers():
             openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1
         ), "Token values are not different."

-        assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
+        assert (
+            llama3_tokens_1 == llama3_tokens_2
+        ), "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."

         print("test tokenizer: It worked!")
     except Exception as e:
@@ -147,3 +154,36 @@ def test_gpt_vision_token_counting():


 # test_gpt_vision_token_counting()
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4-vision-preview",
+        "gpt-4o",
+        "claude-3-opus-20240229",
+        "command-nightly",
+        "mistral/mistral-tiny",
+    ],
+)
+def test_load_test_token_counter(model):
+    """
+    Token count large prompt 100 times.
+
+    Assert time taken is < 1.5s.
+    """
+    import tiktoken
+
+    enc = tiktoken.get_encoding("cl100k_base")
+    messages = [{"role": "user", "content": text}] * 10
+
+    start_time = time.time()
+    for _ in range(50):
+        _ = token_counter(model=model, messages=messages)
+        # enc.encode("".join(m["content"] for m in messages))
+
+    end_time = time.time()
+
+    total_time = end_time - start_time
+    print("model={}, total test time={}".format(model, total_time))
+    assert total_time < 1.5, f"Total encoding time > 1.5s, {total_time}"
@ -54,6 +54,14 @@ os.environ["TIKTOKEN_CACHE_DIR"] = (
|
||||||
)
|
)
|
||||||
|
|
||||||
encoding = tiktoken.get_encoding("cl100k_base")
|
encoding = tiktoken.get_encoding("cl100k_base")
|
||||||
|
from importlib import resources
|
||||||
|
|
||||||
|
with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
|
||||||
|
json_data = json.load(f)
|
||||||
|
# Convert to str (if necessary)
|
||||||
|
json_str = json.dumps(json_data)
|
||||||
|
claude_tokenizer = Tokenizer.from_str(json_str)
|
||||||
|
cohere_tokenizer = Tokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer")
|
||||||
import importlib.metadata
|
import importlib.metadata
|
||||||
from ._logging import verbose_logger
|
from ._logging import verbose_logger
|
||||||
from .types.router import LiteLLM_Params
|
from .types.router import LiteLLM_Params
|
||||||
|
@@ -3848,23 +3856,13 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):

 @lru_cache(maxsize=128)
 def _select_tokenizer(model: str):
-    from importlib import resources
-
-    if model in litellm.cohere_models:
+    global claude_tokenizer, cohere_tokenizer
+    if model in litellm.cohere_models and "command-r" in model:
         # cohere
-        tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
-        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+        return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer}
     # anthropic
-    elif model in litellm.anthropic_models:
-        with resources.open_text(
-            "litellm.llms.tokenizers", "anthropic_tokenizer.json"
-        ) as f:
-            json_data = json.load(f)
-        # Convert to str (if necessary)
-        json_str = json.dumps(json_data)
-        # load tokenizer
-        tokenizer = Tokenizer.from_str(json_str)
-        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    elif model in litellm.anthropic_models and "claude-3" not in model:
+        return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer}
     # llama2
     elif "llama-2" in model.lower() or "replicate" in model.lower():
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
@@ -4170,9 +4168,6 @@ def token_counter(
     if model is not None or custom_tokenizer is not None:
         tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
         if tokenizer_json["type"] == "huggingface_tokenizer":
-            print_verbose(
-                f"Token Counter - using hugging face token counter, for model={model}"
-            )
             enc = tokenizer_json["tokenizer"].encode(text)
             num_tokens = len(enc.ids)
         elif tokenizer_json["type"] == "openai_tokenizer":
@@ -4207,6 +4202,7 @@ def token_counter(
             )
     else:
         num_tokens = len(encoding.encode(text, disallowed_special=()))  # type: ignore

     return num_tokens
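To sanity-check the timing claim outside the test suite, here is a rough benchmark in the spirit of the new test_load_test_token_counter; the prompt below is an arbitrary stand-in for litellm's bundled large_text sample, while the 50 iterations and the model names come from the diff above.

import time

from litellm import token_counter

# Arbitrary long prompt; the real test imports a bundled large_text sample instead.
long_prompt = "lorem ipsum dolor sit amet " * 2000
messages = [{"role": "user", "content": long_prompt}] * 10

for model in ["gpt-4o", "claude-3-opus-20240229", "command-nightly"]:
    start = time.time()
    for _ in range(50):
        token_counter(model=model, messages=messages)
    print("model={}, total time={:.2f}s".format(model, time.time() - start))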