From 7f04758bcb80d7314c5f743a6315fc6232824489 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 30 Nov 2023 18:22:03 -0800
Subject: [PATCH] (fix) support counting tokens for tool calls

---
 litellm/utils.py | 52 +++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index 2f07f8dfd..c4d9e7670 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -55,8 +55,13 @@ from .exceptions import (
 )
 from typing import cast, List, Dict, Union, Optional, Literal
 from .caching import Cache
-
+from concurrent.futures import ThreadPoolExecutor
 ####### ENVIRONMENT VARIABLES ####################
+# Adjust to your specific application needs / system capabilities.
+MAX_THREADS = 100
+
+# Create a ThreadPoolExecutor
+executor = ThreadPoolExecutor(max_workers=MAX_THREADS)
 dotenv.load_dotenv() # Loading env variables using dotenv
 sentry_sdk_instance = None
 capture_exception = None
@@ -1556,7 +1561,7 @@ def decode(model: str, tokens: List[int]):
     dec = tokenizer_json["tokenizer"].decode(tokens)
     return dec
 
-def openai_token_counter(messages, model="gpt-3.5-turbo-0613"):
+def openai_token_counter(messages: Optional[list]=None, model="gpt-3.5-turbo-0613", text: Optional[str]= None):
     """
     Return the number of tokens used by a list of messages.
 
@@ -1568,8 +1573,10 @@ def openai_token_counter(messages, model="gpt-3.5-turbo-0613"):
         print_verbose("Warning: model not found. Using cl100k_base encoding.")
         encoding = tiktoken.get_encoding("cl100k_base")
     if model in {
+        "gpt-3.5-turbo",
         "gpt-3.5-turbo-0613",
         "gpt-3.5-turbo-16k-0613",
+        "gpt-4",
         "gpt-4-0314",
         "gpt-4-32k-0314",
         "gpt-4-0613",
@@ -1580,23 +1587,21 @@
     elif model == "gpt-3.5-turbo-0301":
         tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
         tokens_per_name = -1  # if there's a name, the role is omitted
-    elif "gpt-3.5-turbo" in model:
-        print_verbose("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
-        return openai_token_counter(messages, model="gpt-3.5-turbo-0613")
-    elif "gpt-4" in model:
-        print_verbose("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
-        return openai_token_counter(messages, model="gpt-4-0613")
     else:
         raise NotImplementedError(
             f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
         )
     num_tokens = 0
-    for message in messages:
-        num_tokens += tokens_per_message
-        for key, value in message.items():
-            num_tokens += len(encoding.encode(value))
-            if key == "name":
-                num_tokens += tokens_per_name
+
+    if text:
+        num_tokens = len(encoding.encode(text, disallowed_special=()))
+    elif messages:
+        for message in messages:
+            num_tokens += tokens_per_message
+            for key, value in message.items():
+                num_tokens += len(encoding.encode(value, disallowed_special=()))
+                if key == "name":
+                    num_tokens += tokens_per_name
     num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
     return num_tokens
 
@@ -1616,19 +1621,26 @@ def token_counter(model="", text=None, messages: Optional[List] = None):
     if text == None:
         if messages is not None:
             print_verbose(f"token_counter messages received: {messages}")
-            text = "".join([message["content"] for message in messages])
+            text = ""
+            for message in messages:
+                if message.get("content", None):
+                    text += message["content"]
+                if 'tool_calls' in message:
+                    for tool_call in message['tool_calls']:
+                        if 'function' in tool_call:
+                            function_arguments = tool_call['function']['arguments']
+                            text += function_arguments
         else:
             raise ValueError("text and messages cannot both be None")
     num_tokens = 0
-
     if model is not None:
         tokenizer_json = _select_tokenizer(model=model)
         if tokenizer_json["type"] == "huggingface_tokenizer":
             enc = tokenizer_json["tokenizer"].encode(text)
             num_tokens = len(enc.ids)
         elif tokenizer_json["type"] == "openai_tokenizer":
-            if model in litellm.open_ai_chat_completion_models and messages != None:
-                num_tokens = openai_token_counter(messages, model=model)
+            if model in litellm.open_ai_chat_completion_models:
+                num_tokens = openai_token_counter(text=text, model=model)
             else:
                 enc = tokenizer_json["tokenizer"].encode(text)
                 num_tokens = len(enc)
@@ -4640,7 +4652,8 @@ def safe_crash_reporting(model=None, exception=None, custom_llm_provider=None):
         "exception": str(exception),
         "custom_llm_provider": custom_llm_provider,
     }
-    threading.Thread(target=litellm_telemetry, args=(data,), daemon=True).start()
+    executor.submit(litellm_telemetry, data)
+    # threading.Thread(target=litellm_telemetry, args=(data,), daemon=True).start()
 
 def get_or_generate_uuid():
     temp_dir = os.path.join(os.path.abspath(os.sep), "tmp")
@@ -4707,7 +4720,6 @@ def litellm_telemetry(data):
         # [Non-Blocking Error]
         return
 
-
 ######### Secret Manager ############################
 # checks if user has passed in a secret manager client
 # if passed in then checks the secret there
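
Usage sketch (not part of the patch): a minimal example of how the patched token_counter could be exercised with an assistant message that carries tool calls, which is the case this change fixes. The message payload, tool-call id, and function name below are made up for illustration; only the token_counter(model=..., messages=...) call itself is taken from the patched code.

import litellm

messages = [
    {"role": "user", "content": "What's the weather in Boston?"},
    {
        "role": "assistant",
        "content": None,  # assistant tool-call turns often carry no text content
        "tool_calls": [
            {
                "id": "call_abc123",  # hypothetical id
                "type": "function",
                "function": {
                    "name": "get_current_weather",  # hypothetical function
                    "arguments": '{"location": "Boston", "unit": "fahrenheit"}',
                },
            }
        ],
    },
]

# Before this patch, token_counter joined message["content"] strings directly,
# so a tool-call message with content=None would break the join and the
# function arguments were never counted. With the fix, the arguments string
# is appended to the text that gets tokenized.
num_tokens = litellm.token_counter(model="gpt-3.5-turbo", messages=messages)
print(num_tokens)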