feat(utils.py): adding encode and decode functions

commit 4eeadd284a (parent c038731c48) in BerriAI/litellm
3 changed files with 80 additions and 31 deletions

@@ -325,7 +325,9 @@ from .utils import (
     check_valid_key,
     get_llm_provider,
     completion_with_config,
-    register_model
+    register_model,
+    encode,
+    decode
 )
 from .llms.huggingface_restapi import HuggingfaceConfig
 from .llms.anthropic import AnthropicConfig
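
With the package-level re-export above, the new helpers sit next to token_counter in litellm's public surface. A minimal round-trip sketch, assuming a build that includes this commit (model name chosen as in the tests below):

    from litellm import encode, decode

    # "gpt-3.5-turbo" falls through to the default tiktoken branch,
    # so encode returns a plain list of ints that decode accepts directly
    tokens = encode(model="gpt-3.5-turbo", text="Hello world")
    assert decode(model="gpt-3.5-turbo", tokens=tokens) == "Hello world"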

@@ -8,7 +8,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 ) # Adds the parent directory to the system path
 import time
-from litellm import token_counter
+from litellm import token_counter, encode, decode


 def test_tokenizers():

@@ -38,4 +38,35 @@ def test_tokenizers():
     except Exception as e:
         pytest.fail(f'An exception occured: {e}')

-test_tokenizers()
+# test_tokenizers()
+
+def test_encoding_and_decoding():
+    try:
+        sample_text = "Hellö World, this is my input string!"
+        # openai encoding + decoding
+        openai_tokens = encode(model="gpt-3.5-turbo", text=sample_text)
+        openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens)
+
+        assert openai_text == sample_text
+
+        # claude encoding + decoding
+        claude_tokens = encode(model="claude-instant-1", text=sample_text)
+        claude_text = decode(model="claude-instant-1", tokens=claude_tokens.ids)
+
+        assert claude_text == sample_text
+
+        # cohere encoding + decoding
+        cohere_tokens = encode(model="command-nightly", text=sample_text)
+        cohere_text = decode(model="command-nightly", tokens=cohere_tokens.ids)
+
+        assert cohere_text == sample_text
+
+        # llama2 encoding + decoding
+        llama2_tokens = encode(model="meta-llama/Llama-2-7b-chat", text=sample_text)
+        llama2_text = decode(model="meta-llama/Llama-2-7b-chat", tokens=llama2_tokens.ids)
+
+        assert llama2_text == sample_text
+    except Exception as e:
+        pytest.fail(f'An exception occured: {e}')
+
+test_encoding_and_decoding()
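
Note the asymmetry the new test works around: on the tiktoken path, encode returns a plain list of ints, which decode accepts as-is, while the Hugging Face-backed models (Anthropic, Cohere, Llama-2) return a tokenizers.Encoding object, so the test passes claude_tokens.ids rather than claude_tokens. A sketch of a caller-side normalizer (token_ids is a hypothetical helper, not part of this commit):

    from litellm import encode

    def token_ids(model: str, text: str) -> list:
        # hypothetical helper: return raw token ids for either backend
        enc = encode(model=model, text=text)
        return enc if isinstance(enc, list) else enc.ids  # Encoding -> .ids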

@@ -869,6 +869,42 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
     return a100_80gb_price_per_second_public*total_time


+def _select_tokenizer(model: str):
+    # cohere
+    if model in litellm.cohere_models:
+        tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
+        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    # anthropic
+    elif model in litellm.anthropic_models:
+        # Read the JSON file
+        filename = pkg_resources.resource_filename(__name__, 'llms/tokenizers/anthropic_tokenizer.json')
+        with open(filename, 'r') as f:
+            json_data = json.load(f)
+        # Decode the JSON data from utf-8
+        json_data_decoded = json.dumps(json_data, ensure_ascii=False)
+        # Convert to str
+        json_str = str(json_data_decoded)
+        # load tokenizer
+        tokenizer = Tokenizer.from_str(json_str)
+        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    # llama2
+    elif "llama-2" in model.lower():
+        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
+    # default - tiktoken
+    else:
+        return {"type": "openai_tokenizer", "tokenizer": encoding}
+
+def encode(model: str, text: str):
+    tokenizer_json = _select_tokenizer(model=model)
+    enc = tokenizer_json["tokenizer"].encode(text)
+    return enc
+
+def decode(model: str, tokens: List[int]):
+    tokenizer_json = _select_tokenizer(model=model)
+    dec = tokenizer_json["tokenizer"].decode(tokens)
+    return dec
+
 def token_counter(model="", text=None, messages: Optional[List] = None):
     """
     Count the number of tokens in a given text using a specified model.
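
_select_tokenizer always hands back a two-key dict, so callers branch on "type" instead of re-checking model names; this is exactly how the refactored token_counter below consumes it. A sketch of that consumption pattern (model and text are arbitrary):

    # mirrors the dispatch in token_counter below
    tokenizer_json = _select_tokenizer(model="command-nightly")
    enc = tokenizer_json["tokenizer"].encode("sample text")
    if tokenizer_json["type"] == "huggingface_tokenizer":
        num_tokens = len(enc.ids)   # tokenizers.Encoding carries ids
    else:  # "openai_tokenizer", i.e. tiktoken
        num_tokens = len(enc)       # already a list of ints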

@@ -881,42 +917,22 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
     Returns:
         int: The number of tokens in the text.
     """
-    # use tiktoken or anthropic's tokenizer depending on the model
+    # use tiktoken, anthropic, cohere or llama2's tokenizer depending on the model
     if text == None:
         if messages is not None:
-            text = " ".join([message["content"] for message in messages])
+            text = "".join([message["content"] for message in messages])
         else:
             raise ValueError("text and messages cannot both be None")
     num_tokens = 0

     if model is not None:
-        # cohere
-        if model in litellm.cohere_models:
-            tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
-            enc = tokenizer.encode(text)
-            num_tokens = len(enc.ids)
-        # anthropic
-        elif model in litellm.anthropic_models:
-            # Read the JSON file
-            filename = pkg_resources.resource_filename(__name__, 'llms/tokenizers/anthropic_tokenizer.json')
-            with open(filename, 'r') as f:
-                json_data = json.load(f)
-            # Decode the JSON data from utf-8
-            json_data_decoded = json.dumps(json_data, ensure_ascii=False)
-            # Convert to str
-            json_str = str(json_data_decoded)
-            # load tokenizer
-            tokenizer = Tokenizer.from_str(json_str)
-            enc = tokenizer.encode(text)
-            num_tokens = len(enc.ids)
-        # llama2
-        elif "llama-2" in model.lower():
-            tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-            enc = tokenizer.encode(text)
-            num_tokens = len(enc.ids)
-        # default - tiktoken
-        else:
-            num_tokens = len(encoding.encode(text))
+        tokenizer_json = _select_tokenizer(model=model)
+        if tokenizer_json["type"] == "huggingface_tokenizer":
+            enc = tokenizer_json["tokenizer"].encode(text)
+            num_tokens = len(enc.ids)
+        elif tokenizer_json["type"] == "openai_tokenizer":
+            enc = tokenizer_json["tokenizer"].encode(text)
+            num_tokens = len(enc)
     else:
         num_tokens = len(encoding.encode(text))
     return num_tokens
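
The refactor leaves token_counter's call signature untouched: it still takes either raw text or a chat-style messages list, though message contents are now joined with no separator ("".join instead of " ".join), which can shift counts slightly. A usage sketch (exact counts depend on the selected tokenizer):

    from litellm import token_counter

    n_text = token_counter(model="gpt-3.5-turbo", text="Hello world")
    n_msgs = token_counter(
        model="claude-instant-1",
        messages=[{"role": "user", "content": "Hello world"}],
    )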