diff --git a/litellm/main.py b/litellm/main.py
index ea6b57154..a487563ba 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -3268,7 +3268,9 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
             print_verbose(f"token_counter failed, assuming prompt tokens is 0")
             response["usage"]["prompt_tokens"] = 0
         response["usage"]["completion_tokens"] = token_counter(
-            model=model, text=completion_output
+            model=model,
+            text=combined_content,
+            count_response_tokens=True,  # count_response_tokens is a flag to tell the token counter this is a response, so it should not add the extra tokens we add for input messages
         )
         response["usage"]["total_tokens"] = (
             response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
diff --git a/litellm/utils.py b/litellm/utils.py
index 88bcd078f..6882a2a1f 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2460,12 +2460,16 @@ def openai_token_counter(
     model="gpt-3.5-turbo-0613",
     text: Optional[str] = None,
     is_tool_call: Optional[bool] = False,
+    count_response_tokens: Optional[
+        bool
+    ] = False,  # Flag passed from litellm.stream_chunk_builder, to indicate we are counting tokens for an LLM response. We need this because for LLM input we add +3 tokens per message - based on OpenAI's token counter
 ):
     """
     Return the number of tokens used by a list of messages.
 
     Borrowed from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb.
     """
+    print_verbose(f"LiteLLM: Utils - Counting tokens for OpenAI model={model}")
     try:
         encoding = tiktoken.encoding_for_model(model)
     except KeyError:
@@ -2516,8 +2520,10 @@ def openai_token_counter(
                         num_tokens += calculage_img_tokens(
                             data=image_url_str, mode="auto"
                         )
-    elif text is not None:
+    elif text is not None and count_response_tokens == True:
+        # This is the case where we need to count tokens for a streamed response. We should NOT add +3 tokens per message in this branch
         num_tokens = len(encoding.encode(text, disallowed_special=()))
+        return num_tokens
 
     num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
     return num_tokens
@@ -2620,6 +2626,7 @@ def token_counter(
     model="",
     text: Optional[Union[str, List[str]]] = None,
     messages: Optional[List] = None,
+    count_response_tokens: Optional[bool] = False,
 ):
     """
     Count the number of tokens in a given text using a specified model.
@@ -2683,7 +2690,11 @@ def token_counter(
         or model in litellm.azure_llms
     ):
         num_tokens = openai_token_counter(
-            text=text, model=model, messages=messages, is_tool_call=is_tool_call  # type: ignore
+            text=text,  # type: ignore
+            model=model,
+            messages=messages,
+            is_tool_call=is_tool_call,
+            count_response_tokens=count_response_tokens,
         )
     else:
         enc = tokenizer_json["tokenizer"].encode(text)
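
Below is a minimal usage sketch (not part of the diff) illustrating the intent of the new `count_response_tokens` flag, assuming a litellm build that includes this change. When counting tokens for a streamed response, the text is encoded directly with the model's tiktoken encoding and the +3 per-message priming tokens applied to input messages are skipped. The model name and strings are illustrative only.

```python
# Minimal sketch, assuming a litellm version that includes the count_response_tokens flag.
import litellm

streamed_text = "Hello! How can I help you today?"

# Response-side count: encodes the text directly, without the +3 priming tokens
# that openai_token_counter adds for every input message.
response_tokens = litellm.token_counter(
    model="gpt-3.5-turbo",
    text=streamed_text,
    count_response_tokens=True,
)

# Input-side count for comparison: the per-message overhead still applies here.
prompt_tokens = litellm.token_counter(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi, who are you?"}],
)

print(f"response tokens: {response_tokens}, prompt tokens: {prompt_tokens}")
```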