fix(utils.py): default claude-3 to tiktoken (0.8s faster than hf tokenizer)

Krrish Dholakia 2024-05-14 18:36:34 -07:00
parent 46a81524ab
commit a1dd341ca1
2 changed files with 57 additions and 21 deletions
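The speed claim in the commit title can be sanity-checked with a rough micro-benchmark: encode a large prompt repeatedly with the Hugging Face tokenizer litellm previously used for all Anthropic models (the bundled anthropic_tokenizer.json) and with tiktoken's cl100k_base encoding. This is only a sketch, not part of the commit; the 50-iteration count mirrors the new load test below, absolute timings depend on the machine, and the 0.8s figure comes from the commit title, not from this snippet.

# Rough micro-benchmark sketch (not part of the commit): compare the bundled
# Anthropic HF tokenizer against tiktoken's cl100k_base on a large prompt.
# Assumes litellm, tokenizers, and tiktoken are installed.
import json
import time
from importlib import resources

import tiktoken
from tokenizers import Tokenizer

with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
    hf_tokenizer = Tokenizer.from_str(json.dumps(json.load(f)))

tiktoken_enc = tiktoken.get_encoding("cl100k_base")
sample_text = "hello, how are you doing today? " * 5000  # stand-in for the large test prompt

start = time.time()
for _ in range(50):
    _ = len(hf_tokenizer.encode(sample_text).ids)
print("hf tokenizer:", time.time() - start)

start = time.time()
for _ in range(50):
    _ = len(tiktoken_enc.encode(sample_text))
print("tiktoken cl100k_base:", time.time() - start)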

View file

@@ -10,6 +10,7 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import time
 from litellm import token_counter, create_pretrained_tokenizer, encode, decode
+from litellm.tests.large_text import text
 
 
 def test_token_counter_normal_plus_function_calling():
@@ -70,10 +71,14 @@ def test_tokenizers():
         )
 
         # llama3 tokenizer (also testing custom tokenizer)
-        llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text)
+        llama3_tokens_1 = token_counter(
+            model="meta-llama/llama-3-70b-instruct", text=sample_text
+        )
 
         llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
-        llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text)
+        llama3_tokens_2 = token_counter(
+            custom_tokenizer=llama3_tokenizer, text=sample_text
+        )
 
         print(
             f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}"
@@ -84,7 +89,9 @@ def test_tokenizers():
             openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1
         ), "Token values are not different."
 
-        assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
+        assert (
+            llama3_tokens_1 == llama3_tokens_2
+        ), "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
 
         print("test tokenizer: It worked!")
     except Exception as e:
@@ -147,3 +154,36 @@ def test_gpt_vision_token_counting():
 
 
 # test_gpt_vision_token_counting()
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4-vision-preview",
+        "gpt-4o",
+        "claude-3-opus-20240229",
+        "command-nightly",
+        "mistral/mistral-tiny",
+    ],
+)
+def test_load_test_token_counter(model):
+    """
+    Token count large prompt 100 times.
+    Assert time taken is < 1.5s.
+    """
+    import tiktoken
+
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    messages = [{"role": "user", "content": text}] * 10
+
+    start_time = time.time()
+    for _ in range(50):
+        _ = token_counter(model=model, messages=messages)
+        # enc.encode("".join(m["content"] for m in messages))
+
+    end_time = time.time()
+    total_time = end_time - start_time
+    print("model={}, total test time={}".format(model, total_time))
+    assert total_time < 1.5, f"Total encoding time > 1.5s, {total_time}"
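To run just this new load test locally, pytest's Python entry point can select it by name; a minimal sketch, assuming pytest and litellm are installed and that you launch it from the directory containing the test module (its path is not shown in this commit view):

# Sketch: run only the new load test, across all parametrized models.
import sys

import pytest

sys.exit(pytest.main(["-q", "-s", "-k", "test_load_test_token_counter"]))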

View file

@@ -54,6 +54,14 @@ os.environ["TIKTOKEN_CACHE_DIR"] = (
 )
 encoding = tiktoken.get_encoding("cl100k_base")
+from importlib import resources
+
+with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
+    json_data = json.load(f)
+# Convert to str (if necessary)
+json_str = json.dumps(json_data)
+claude_tokenizer = Tokenizer.from_str(json_str)
+cohere_tokenizer = Tokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer")
 
 import importlib.metadata
 from ._logging import verbose_logger
 from .types.router import LiteLLM_Params
@@ -3848,23 +3856,13 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
 @lru_cache(maxsize=128)
 def _select_tokenizer(model: str):
-    from importlib import resources
-
-    if model in litellm.cohere_models:
+    global claude_tokenizer, cohere_tokenizer
+    if model in litellm.cohere_models and "command-r" in model:
         # cohere
-        tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
-        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+        return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer}
     # anthropic
-    elif model in litellm.anthropic_models:
-        with resources.open_text(
-            "litellm.llms.tokenizers", "anthropic_tokenizer.json"
-        ) as f:
-            json_data = json.load(f)
-        # Convert to str (if necessary)
-        json_str = json.dumps(json_data)
-        # load tokenizer
-        tokenizer = Tokenizer.from_str(json_str)
-        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    elif model in litellm.anthropic_models and "claude-3" not in model:
+        return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer}
     # llama2
     elif "llama-2" in model.lower() or "replicate" in model.lower():
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
@@ -4170,9 +4168,6 @@ def token_counter(
     if model is not None or custom_tokenizer is not None:
         tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
         if tokenizer_json["type"] == "huggingface_tokenizer":
-            print_verbose(
-                f"Token Counter - using hugging face token counter, for model={model}"
-            )
             enc = tokenizer_json["tokenizer"].encode(text)
             num_tokens = len(enc.ids)
         elif tokenizer_json["type"] == "openai_tokenizer":
@@ -4207,6 +4202,7 @@ def token_counter(
             )
         else:
             num_tokens = len(encoding.encode(text, disallowed_special=()))  # type: ignore
     return num_tokens
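The net effect of the utils.py changes: _select_tokenizer now returns the pre-loaded Hugging Face tokenizers only for command-r Cohere models and for Anthropic models older than claude-3; claude-3 (and anything else unmatched) falls through to the default branch which, outside these hunks, returns the module-level tiktoken cl100k_base encoding as an "openai_tokenizer", the case handled by the unchanged elif in token_counter. A small sketch of the expected routing, with that default branch and the contents of litellm.cohere_models treated as assumptions since they are not part of the diff:

# Illustrative only: _select_tokenizer is a private helper, and the tiktoken
# fallback branch sits outside the hunks above, so the "openai_tokenizer"
# results below are assumptions inferred from the unchanged elif branch in
# token_counter. Also assumes "command-r" is registered in litellm.cohere_models.
from litellm.utils import _select_tokenizer

for name in ["command-r", "claude-2", "claude-3-opus-20240229", "gpt-4o"]:
    print(name, "->", _select_tokenizer(model=name)["type"])

# Expected (assumed) output:
#   command-r              -> huggingface_tokenizer  (pre-loaded command-r tokenizer)
#   claude-2               -> huggingface_tokenizer  (pre-loaded anthropic tokenizer)
#   claude-3-opus-20240229 -> openai_tokenizer       (tiktoken cl100k_base)
#   gpt-4o                 -> openai_tokenizer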