diff --git a/litellm/main.py b/litellm/main.py
index f5f588f49..f54f70a21 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -144,8 +144,10 @@ from .types.llms.openai import HttpxBinaryResponseContent
 from .types.utils import (
     AdapterCompletionStreamWrapper,
     ChatCompletionMessageToolCall,
+    CompletionTokensDetails,
     FileTypes,
     HiddenParams,
+    PromptTokensDetails,
     all_litellm_params,
 )
 
@@ -5481,7 +5483,13 @@ def stream_chunk_builder(
         chunks=chunks, messages=messages
     )
     role = chunks[0]["choices"][0]["delta"]["role"]
-    finish_reason = chunks[-1]["choices"][0]["finish_reason"]
+    finish_reason = "stop"
+    for chunk in chunks:
+        if "choices" in chunk and len(chunk["choices"]) > 0:
+            if hasattr(chunk["choices"][0], "finish_reason"):
+                finish_reason = chunk["choices"][0].finish_reason
+            elif "finish_reason" in chunk["choices"][0]:
+                finish_reason = chunk["choices"][0]["finish_reason"]
 
     # Initialize the response dictionary
     response = {
@@ -5512,7 +5520,8 @@ def stream_chunk_builder(
     tool_call_chunks = [
         chunk
         for chunk in chunks
-        if "tool_calls" in chunk["choices"][0]["delta"]
+        if len(chunk["choices"]) > 0
+        and "tool_calls" in chunk["choices"][0]["delta"]
         and chunk["choices"][0]["delta"]["tool_calls"] is not None
     ]
 
@@ -5590,7 +5599,8 @@ def stream_chunk_builder(
     function_call_chunks = [
         chunk
         for chunk in chunks
-        if "function_call" in chunk["choices"][0]["delta"]
+        if len(chunk["choices"]) > 0
+        and "function_call" in chunk["choices"][0]["delta"]
         and chunk["choices"][0]["delta"]["function_call"] is not None
     ]
 
@@ -5625,7 +5635,8 @@ def stream_chunk_builder(
     content_chunks = [
         chunk
         for chunk in chunks
-        if "content" in chunk["choices"][0]["delta"]
+        if len(chunk["choices"]) > 0
+        and "content" in chunk["choices"][0]["delta"]
         and chunk["choices"][0]["delta"]["content"] is not None
     ]
 
@@ -5657,6 +5668,8 @@ def stream_chunk_builder(
     ## anthropic prompt caching information ##
     cache_creation_input_tokens: Optional[int] = None
     cache_read_input_tokens: Optional[int] = None
+    completion_tokens_details: Optional[CompletionTokensDetails] = None
+    prompt_tokens_details: Optional[PromptTokensDetails] = None
     for chunk in chunks:
         usage_chunk: Optional[Usage] = None
         if "usage" in chunk:
@@ -5674,6 +5687,26 @@
                 )
             if "cache_read_input_tokens" in usage_chunk:
                 cache_read_input_tokens = usage_chunk.get("cache_read_input_tokens")
+            if hasattr(usage_chunk, "completion_tokens_details"):
+                if isinstance(usage_chunk.completion_tokens_details, dict):
+                    completion_tokens_details = CompletionTokensDetails(
+                        **usage_chunk.completion_tokens_details
+                    )
+                elif isinstance(
+                    usage_chunk.completion_tokens_details, CompletionTokensDetails
+                ):
+                    completion_tokens_details = (
+                        usage_chunk.completion_tokens_details
+                    )
+            if hasattr(usage_chunk, "prompt_tokens_details"):
+                if isinstance(usage_chunk.prompt_tokens_details, dict):
+                    prompt_tokens_details = PromptTokensDetails(
+                        **usage_chunk.prompt_tokens_details
+                    )
+                elif isinstance(
+                    usage_chunk.prompt_tokens_details, PromptTokensDetails
+                ):
+                    prompt_tokens_details = usage_chunk.prompt_tokens_details
 
     try:
         response["usage"]["prompt_tokens"] = prompt_tokens or token_counter(
@@ -5700,6 +5733,11 @@
     if cache_read_input_tokens is not None:
         response["usage"]["cache_read_input_tokens"] = cache_read_input_tokens
 
+    if completion_tokens_details is not None:
+        response["usage"]["completion_tokens_details"] = completion_tokens_details
+    if prompt_tokens_details is not None:
+        response["usage"]["prompt_tokens_details"] = prompt_tokens_details
+
     return convert_to_model_response_object(
         response_object=response,
         model_response_object=model_response,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 5f36a359e..f5df0a282 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,5 +1,5 @@
 model_list:
-  - model_name: gpt-4o-realtime-audio
+  - model_name: gpt-4o
     litellm_params:
       model: azure/gpt-4o-realtime-preview
       api_key: os.environ/AZURE_SWEDEN_API_KEY
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index ab4f27669..c8adf0bdc 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -11,7 +11,7 @@ from openai.types.completion_usage import (
     CompletionUsage,
     PromptTokensDetails,
 )
-from pydantic import ConfigDict, PrivateAttr
+from pydantic import BaseModel, ConfigDict, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override
 
 from ..litellm_core_utils.core_helpers import map_finish_reason
@@ -677,6 +677,8 @@ class ModelResponse(OpenAIObject):
                         _new_choice = choice
                     elif isinstance(choice, dict):
                         _new_choice = StreamingChoices(**choice)
+                    elif isinstance(choice, BaseModel):
+                        _new_choice = StreamingChoices(**choice.model_dump())
                     new_choices.append(_new_choice)
                 choices = new_choices
             else:
diff --git a/litellm/utils.py b/litellm/utils.py
index 2e8ec7c49..7fcc6e6e1 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -7813,9 +7813,7 @@ class CustomStreamWrapper:
                 )
             elif isinstance(response_obj["usage"], BaseModel):
                 model_response.usage = litellm.Usage(
-                    prompt_tokens=response_obj["usage"].prompt_tokens,
-                    completion_tokens=response_obj["usage"].completion_tokens,
-                    total_tokens=response_obj["usage"].total_tokens,
+                    **response_obj["usage"].model_dump()
                 )
 
             model_response.model = self.model
diff --git a/tests/local_testing/test_stream_chunk_builder.py b/tests/local_testing/test_stream_chunk_builder.py
index 35ce34c0c..ca3ea0e02 100644
--- a/tests/local_testing/test_stream_chunk_builder.py
+++ b/tests/local_testing/test_stream_chunk_builder.py
@@ -5,6 +5,7 @@ import time
 import traceback
 
 import pytest
+from typing import List
 
 sys.path.insert(
     0, os.path.abspath("../..")
@@ -12,7 +13,6 @@ sys.path.insert(
 import os
 
 import dotenv
-import pytest
 from openai import OpenAI
 
 import litellm
@@ -622,3 +622,46 @@ def test_stream_chunk_builder_multiple_tool_calls():
     assert (
         expected_response.choices == response.choices
     ), "\nGot={}\n, Expected={}\n".format(response.choices, expected_response.choices)
+
+
+def test_stream_chunk_builder_openai_prompt_caching():
+    from openai import OpenAI
+    from pydantic import BaseModel
+
+    client = OpenAI(
+        # This is the default and can be omitted
+        api_key=os.getenv("OPENAI_API_KEY"),
+    )
+
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": "Say this is a test",
+            }
+        ],
+        model="gpt-3.5-turbo",
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    chunks: List[litellm.ModelResponse] = []
+    usage_obj = None
+    for chunk in chat_completion:
+        chunks.append(litellm.ModelResponse(**chunk.model_dump(), stream=True))
+
+    print(f"chunks: {chunks}")
+
+    usage_obj: litellm.Usage = chunks[-1].usage  # type: ignore
+
+    response = stream_chunk_builder(chunks=chunks)
+    print(f"response: {response}")
+    print(f"response usage: {response.usage}")
+    for k, v in usage_obj.model_dump().items():
+        print(k, v)
+        response_usage_value = getattr(response.usage, k)  # type: ignore
+        print(f"response_usage_value: {response_usage_value}")
+        print(f"type: {type(response_usage_value)}")
+        if isinstance(response_usage_value, BaseModel):
+            assert response_usage_value.model_dump() == v
+        else:
+            assert response_usage_value == v