From 086981858bbade1201ce5a186b2b1209d1913842 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Mon, 21 Apr 2025 20:40:25 -0700
Subject: [PATCH] fix(vertex_and_google_ai_Studio_gemini.py): add check when
 thinking disabled allows billing to work correctly

Fixes https://github.com/BerriAI/litellm/issues/10121
---
 .../vertex_and_google_ai_studio_gemini.py |  29 +++++-
 litellm/types/utils.py                    |   9 +-
 .../test_amazing_vertex_completion.py     |  93 +++++++++++++++++++
 3 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
index d4c74f4910..3549614422 100644
--- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        is_thinking_enabled: Optional[
+            bool
+        ] = None,  # gemini-2.5-flash has thinking enabled by default
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             reasoning_tokens = completion_response["usageMetadata"][
                 "thoughtsTokenCount"
             ]
+
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cached_tokens,
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
         )
 
         return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
 
         return grounding_metadata, safety_ratings, citation_metadata
 
+    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
+        """
+        Returns true if thinking is enabled for the model
+        """
+        thinking_config = cast(
+            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
+        )
+
+        if thinking_config is None:
+            return None
+
+        thinking_budget = thinking_config.get("thinkingBudget")
+        if thinking_budget == 0:
+            return False
+
+        return True
+
     def transform_response(
         self,
         model: str,
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
                 _candidates, model_response, litellm_params
             )
 
-        usage = self._calculate_usage(completion_response=completion_response)
+        thinking_enabled = self._is_thinking_enabled_function(optional_params)
+        usage = self._calculate_usage(
+            completion_response=completion_response,
+            is_thinking_enabled=thinking_enabled,
+        )
         setattr(model_response, "usage", usage)
 
         ## ADD METADATA TO RESPONSE ##
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 533ffaa64a..65fc63e917 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
     CompletionTokensDetails
 ):  # wrapper for older openai versions
     text_tokens: Optional[int] = None
+    thinking_enabled: Optional[
+        bool
+    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
     """Text tokens generated by the model."""
 
 
@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
                 completion_tokens - reasoning_tokens if completion_tokens else None
             )
             completion_tokens_details = CompletionTokensDetailsWrapper(
-                reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
+                **{
+                    "reasoning_tokens": reasoning_tokens,
+                    "text_tokens": text_tokens,
+                    **completion_tokens_details,
+                }
             )
 
         # Ensure completion_tokens_details is properly handled
diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py
index ec9b676772..40f98484ae 100644
--- a/tests/local_testing/test_amazing_vertex_completion.py
+++ b/tests/local_testing/test_amazing_vertex_completion.py
@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):
 
     mock_client.assert_called()
     assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")
+
+
+def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_response.headers = {"Content-Type": "application/json"}
+    mock_response.json.return_value = {
+        "candidates": [
+            {
+                "content": {
+                    "role": "model",
+                    "parts": [
+                        {
+                            "text": "Hello! It's nice to hear from you. How can I help you today?"
+                        }
+                    ]
+                },
+                "finishReason": "STOP",
+                "avgLogprobs": -6.8490977817111549
+            }
+        ],
+        "usageMetadata": {
+            "promptTokenCount": 4,
+            "candidatesTokenCount": 18,
+            "totalTokenCount": 278,
+            "trafficType": "ON_DEMAND",
+            "promptTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 4
+                }
+            ],
+            "candidatesTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 18
+                }
+            ],
+            "thoughtsTokenCount": 256
+        },
+        "modelVersion": "gemini-2.5-flash-preview-04-17",
+        "createTime": "2025-04-22T03:22:20.094867Z",
+        "responseId": "bAsHaJPlBcCWm9IP_6inqAk"
+    }
+    return mock_response
+
+
+
+def test_vertex_ai_gemini_2_5_flash():
+    """
+    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
+    """
+    litellm.set_verbose = True
+    load_vertex_ai_credentials()
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+    client = HTTPHandler()
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is None
+
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            thinking={"type": "enabled", "budget_tokens": 1024},
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is True
+
+
+    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+    #     response = completion(
+    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
+    #         messages=[{"role": "user", "content": "Hello, world!"}],
+    #         thinking={"type": "enabled", "budget_tokens": 0},
+    #         client=client,
+    #     )
+
+    #     mock_client.assert_called()
+    #     assert response.usage is not None
+    #     assert response.usage.completion_tokens_details.thinking_enabled is False
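
Note for reviewers (not part of the patch): a minimal sketch of the tri-state flag this change records. The standalone helper below simply mirrors the patch's _is_thinking_enabled_function without LiteLLM's cast/GeminiThinkingConfig imports; the function name is illustrative only. The distinction matters for billing because gemini-2.5-flash thinks by default, so "no explicit config" (None) is not the same as "explicitly disabled" (False).

    # Illustrative sketch -- mirrors _is_thinking_enabled_function from this patch.
    from typing import Dict, Optional

    def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
        thinking_config = optional_params.get("thinkingConfig")
        if thinking_config is None:
            return None  # no explicit config; gemini-2.5-flash thinks by default
        if thinking_config.get("thinkingBudget") == 0:
            return False  # thinking explicitly disabled via a zero budget
        return True  # explicit non-zero budget

    assert is_thinking_enabled({}) is None
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False

The resulting value is passed into _calculate_usage and surfaces as usage.completion_tokens_details.thinking_enabled via the new CompletionTokensDetailsWrapper field, which is what the tests above assert for the default and budget_tokens=1024 cases.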