From 2d43153efa8e779abf9d3fc1ad7d03698d8e2d44 Mon Sep 17 00:00:00 2001
From: Christian Privitelli <40876121+Priva28@users.noreply.github.com>
Date: Thu, 2 May 2024 15:49:22 +1000
Subject: [PATCH] include methods in init import, add test, fix encode/decode
 param ordering

---
 litellm/__init__.py                 |  2 ++
 litellm/main.py                     |  2 ++
 litellm/tests/test_token_counter.py | 14 +++++++++++---
 litellm/tests/test_utils.py         |  2 ++
 litellm/utils.py                    |  4 ++--
 5 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/litellm/__init__.py b/litellm/__init__.py
index a3d61bce1..0ee22da6d 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -608,6 +608,8 @@ from .utils import (
     get_optional_params,
     modify_integration,
     token_counter,
+    create_pretrained_tokenizer,
+    create_tokenizer,
     cost_per_token,
     completion_cost,
     supports_function_calling,
diff --git a/litellm/main.py b/litellm/main.py
index cdea40d11..b5a986289 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -33,6 +33,8 @@ from litellm.utils import (
     async_mock_completion_streaming_obj,
     convert_to_model_response_object,
     token_counter,
+    create_pretrained_tokenizer,
+    create_tokenizer,
     Usage,
     get_optional_params_embeddings,
     get_optional_params_image_gen,
diff --git a/litellm/tests/test_token_counter.py b/litellm/tests/test_token_counter.py
index af0db487e..4d759d4cf 100644
--- a/litellm/tests/test_token_counter.py
+++ b/litellm/tests/test_token_counter.py
@@ -9,7 +9,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 import time
-from litellm import token_counter, encode, decode
+from litellm import token_counter, create_pretrained_tokenizer, encode, decode
 
 
 def test_token_counter_normal_plus_function_calling():
@@ -69,15 +69,23 @@ def test_tokenizers():
             model="meta-llama/Llama-2-7b-chat", text=sample_text
         )
 
+        # llama3 tokenizer (also testing custom tokenizer)
+        llama3_tokens_1 = token_counter(model="meta-llama/llama-3-70b-instruct", text=sample_text)
+
+        llama3_tokenizer = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
+        llama3_tokens_2 = token_counter(custom_tokenizer=llama3_tokenizer, text=sample_text)
+
         print(
-            f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}"
+            f"openai tokens: {openai_tokens}; claude tokens: {claude_tokens}; cohere tokens: {cohere_tokens}; llama2 tokens: {llama2_tokens}; llama3 tokens: {llama3_tokens_1}"
         )
 
         # assert that all token values are different
         assert (
-            openai_tokens != cohere_tokens != llama2_tokens
+            openai_tokens != cohere_tokens != llama2_tokens != llama3_tokens_1
         ), "Token values are not different."
 
+        assert llama3_tokens_1 == llama3_tokens_2, "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built-in llama3 tokenizer and the results should be the same."
+
         print("test tokenizer: It worked!")
     except Exception as e:
         pytest.fail(f"An exception occured: {e}")
diff --git a/litellm/tests/test_utils.py b/litellm/tests/test_utils.py
index 44fb1607c..57b93df9c 100644
--- a/litellm/tests/test_utils.py
+++ b/litellm/tests/test_utils.py
@@ -20,6 +20,8 @@ from litellm.utils import (
     validate_environment,
     function_to_dict,
     token_counter,
+    create_pretrained_tokenizer,
+    create_tokenizer,
 )
 
 # Assuming your trim_messages, shorten_message_to_fit_limit, and get_token_count functions are all in a module named 'message_utils'
diff --git a/litellm/utils.py b/litellm/utils.py
index eec3a334c..6b1279761 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -3696,7 +3696,7 @@ def _select_tokenizer(model: str):
         return {"type": "openai_tokenizer", "tokenizer": encoding}
 
 
-def encode(model: str, custom_tokenizer: Optional[dict] = None, text: str):
+def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
     """
     Encodes the given text using the specified model.
 
@@ -3713,7 +3713,7 @@ def encode(model: str, custom_tokenizer: Optional[dict] = None, text: str):
     return enc
 
 
-def decode(model: str, custom_tokenizer: Optional[dict] = None, tokens: List[int]):
+def decode(model="", tokens: List[int] = [], custom_tokenizer: Optional[dict] = None):
     tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
     dec = tokenizer_json["tokenizer"].decode(tokens)
     return dec