diff --git a/litellm/tests/test_token_counter.py b/litellm/tests/test_token_counter.py
index 4d759d4cff..50734f7cc2 100644
--- a/litellm/tests/test_token_counter.py
+++ b/litellm/tests/test_token_counter.py
@@ -10,6 +10,7 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import time
 from litellm import token_counter, create_pretrained_tokenizer, encode, decode
+from litellm.tests.large_text import text
 
 
 def test_token_counter_normal_plus_function_calling():
@@ -70,10 +71,14 @@ def test_tokenizers():
         )
 
         # llama3 tokenizer (also testing custom tokenizer)
-        llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text)
+        llama3_tokens_1 = token_counter(
+            model="meta-llama/llama-3-70b-instruct", text=sample_text
+        )
 
         llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
-        llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text)
+        llama3_tokens_2 = token_counter(
+            custom_tokenizer=llama3_tokenizer, text=sample_text
+        )
 
         print(
             f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}"
@@ -84,7 +89,9 @@ def test_tokenizers():
             openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1
         ), "Token values are not different."
 
-        assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
+        assert (
+            llama3_tokens_1 == llama3_tokens_2
+        ), "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
 
         print("test tokenizer: It worked!")
     except Exception as e:
@@ -147,3 +154,36 @@ def test_gpt_vision_token_counting():
 
 
 # test_gpt_vision_token_counting()
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4-vision-preview",
+        "gpt-4o",
+        "claude-3-opus-20240229",
+        "command-nightly",
+        "mistral/mistral-tiny",
+    ],
+)
+def test_load_test_token_counter(model):
+    """
+    Token count a large prompt 50 times.
+
+    Assert time taken is < 1.5s.
+ """ + import tiktoken + + enc = tiktoken.get_encoding("cl100k_base") + messages = [{"role": "user", "content": text}] * 10 + + start_time = time.time() + for _ in range(50): + _ = token_counter(model=model, messages=messages) + # enc.encode("".join(m["content"] for m in messages)) + + end_time = time.time() + + total_time = end_time - start_time + print("model={}, total test time={}".format(model, total_time)) + assert total_time < 1.5, f"Total encoding time > 1.5s, {total_time}" diff --git a/litellm/utils.py b/litellm/utils.py index 00492a7d99..00f39cc013 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -54,6 +54,14 @@ os.environ["TIKTOKEN_CACHE_DIR"] = ( ) encoding = tiktoken.get_encoding("cl100k_base") +from importlib import resources + +with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f: + json_data = json.load(f) +# Convert to str (if necessary) +json_str = json.dumps(json_data) +claude_tokenizer = Tokenizer.from_str(json_str) +cohere_tokenizer = Tokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer") import importlib.metadata from ._logging import verbose_logger from .types.router import LiteLLM_Params @@ -3848,23 +3856,13 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0): @lru_cache(maxsize=128) def _select_tokenizer(model: str): - from importlib import resources - - if model in litellm.cohere_models: + global claude_tokenizer, cohere_tokenizer + if model in litellm.cohere_models and "command-r" in model: # cohere - tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly") - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + return {"type": "huggingface_tokenizer", "tokenizer": cohere_tokenizer} # anthropic - elif model in litellm.anthropic_models: - with resources.open_text( - "litellm.llms.tokenizers", "anthropic_tokenizer.json" - ) as f: - json_data = json.load(f) - # Convert to str (if necessary) - json_str = json.dumps(json_data) - # load tokenizer - tokenizer = Tokenizer.from_str(json_str) - return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} + elif model in litellm.anthropic_models and "claude-3" not in model: + return {"type": "huggingface_tokenizer", "tokenizer": claude_tokenizer} # llama2 elif "llama-2" in model.lower() or "replicate" in model.lower(): tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") @@ -4170,9 +4168,6 @@ def token_counter( if model is not None or custom_tokenizer is not None: tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) if tokenizer_json["type"] == "huggingface_tokenizer": - print_verbose( - f"Token Counter - using hugging face token counter, for model={model}" - ) enc = tokenizer_json["tokenizer"].encode(text) num_tokens = len(enc.ids) elif tokenizer_json["type"] == "openai_tokenizer": @@ -4207,6 +4202,7 @@ def token_counter( ) else: num_tokens = len(encoding.encode(text, disallowed_special=())) # type: ignore + return num_tokens