fix(vertex_and_google_ai_studio_gemini.py): add check when thinking disabled

allows billing to be calculated correctly for models like gemini-2.5-flash, which have thinking enabled by default

Fixes https://github.com/BerriAI/litellm/issues/10121
Krrish Dholakia 2025-04-21 20:40:25 -07:00
parent 4a50cf10fb
commit 086981858b
3 changed files with 129 additions and 2 deletions
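
For context, a rough caller-side sketch of what the new flag is meant to surface, based on the tests added below (the model name and the thinking parameter are taken from those tests; Vertex credential setup is omitted). This is illustrative only and not part of the commit:

import litellm

# No thinkingConfig sent: the flag stays None (model default behavior is used).
resp = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Hello, world!"}],
)
print(resp.usage.completion_tokens_details.thinking_enabled)  # None

# Thinking explicitly enabled with a budget: the flag becomes True,
# which downstream cost tracking can take into account.
resp = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Hello, world!"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)
print(resp.usage.completion_tokens_details.thinking_enabled)  # True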


@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
    def _calculate_usage(
        self,
        completion_response: GenerateContentResponseBody,
        is_thinking_enabled: Optional[
            bool
        ] = None,  # gemini-2.5-flash has thinking enabled by default
    ) -> Usage:
        cached_tokens: Optional[int] = None
        audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            reasoning_tokens = completion_response["usageMetadata"][
                "thoughtsTokenCount"
            ]

        prompt_tokens_details = PromptTokensDetailsWrapper(
            cached_tokens=cached_tokens,
            audio_tokens=audio_tokens,
            text_tokens=text_tokens,
        )

        ## GET USAGE ##
        usage = Usage(
            prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
            prompt_tokens_details=prompt_tokens_details,
            reasoning_tokens=reasoning_tokens,
            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
        )

        return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
        return grounding_metadata, safety_ratings, citation_metadata

    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
        """
        Returns True if thinking is enabled, False if it is explicitly disabled,
        and None if no thinkingConfig was provided.
        """
        thinking_config = cast(
            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
        )
        if thinking_config is None:
            return None
        thinking_budget = thinking_config.get("thinkingBudget")
        if thinking_budget == 0:
            return False
        return True

    def transform_response(
        self,
        model: str,
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            _candidates, model_response, litellm_params
        )

        usage = self._calculate_usage(completion_response=completion_response)
        thinking_enabled = self._is_thinking_enabled_function(optional_params)
        usage = self._calculate_usage(
            completion_response=completion_response,
            is_thinking_enabled=thinking_enabled,
        )
        setattr(model_response, "usage", usage)

        ## ADD METADATA TO RESPONSE ##
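
The helper added above yields a three-way result: None when the caller sent no thinkingConfig at all, False when an explicit thinkingBudget of 0 disables thinking, and True otherwise. A minimal standalone sketch of that mapping, with plain dicts standing in for the GeminiThinkingConfig type and the cast omitted:

from typing import Dict, Optional


def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
    # Simplified mirror of _is_thinking_enabled_function above.
    thinking_config = optional_params.get("thinkingConfig", None)
    if thinking_config is None:
        return None  # nothing was requested, leave the flag unset
    if thinking_config.get("thinkingBudget") == 0:
        return False  # an explicit zero budget disables thinking
    return True  # any other config counts as thinking enabled


assert is_thinking_enabled({}) is None
assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False
assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True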


@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
    CompletionTokensDetails
):  # wrapper for older openai versions
    text_tokens: Optional[int] = None
    thinking_enabled: Optional[
        bool
    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
    """Text tokens generated by the model."""
@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
                completion_tokens - reasoning_tokens if completion_tokens else None
            )
            completion_tokens_details = CompletionTokensDetailsWrapper(
                reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
                **{
                    "reasoning_tokens": reasoning_tokens,
                    "text_tokens": text_tokens,
                    **completion_tokens_details,
                }
            )

        # Ensure completion_tokens_details is properly handled
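
The dict-unpacking change above is what lets _calculate_usage pass completion_tokens_details as a plain dict: the reasoning and text token fields are merged with whatever extra keys the caller supplied (here thinking_enabled), and keys listed later win. A quick illustration of the merge semantics with illustrative values (the real code builds a CompletionTokensDetailsWrapper from the merged dict):

reasoning_tokens = 256
text_tokens = 18
completion_tokens_details = {"thinking_enabled": True}  # as passed in by _calculate_usage

merged = {
    "reasoning_tokens": reasoning_tokens,
    "text_tokens": text_tokens,
    **completion_tokens_details,  # caller-supplied keys are applied last
}
print(merged)
# {'reasoning_tokens': 256, 'text_tokens': 18, 'thinking_enabled': True}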


@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):
    mock_client.assert_called()
    assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")


def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
    mock_response = MagicMock()
    mock_response.status_code = 200
    mock_response.headers = {"Content-Type": "application/json"}
    mock_response.json.return_value = {
        "candidates": [
            {
                "content": {
                    "role": "model",
                    "parts": [
                        {
                            "text": "Hello! It's nice to hear from you. How can I help you today?"
                        }
                    ],
                },
                "finishReason": "STOP",
                "avgLogprobs": -6.8490977817111549,
            }
        ],
        "usageMetadata": {
            "promptTokenCount": 4,
            "candidatesTokenCount": 18,
            "totalTokenCount": 278,
            "trafficType": "ON_DEMAND",
            "promptTokensDetails": [
                {
                    "modality": "TEXT",
                    "tokenCount": 4
                }
            ],
            "candidatesTokensDetails": [
                {
                    "modality": "TEXT",
                    "tokenCount": 18
                }
            ],
            "thoughtsTokenCount": 256,
        },
        "modelVersion": "gemini-2.5-flash-preview-04-17",
        "createTime": "2025-04-22T03:22:20.094867Z",
        "responseId": "bAsHaJPlBcCWm9IP_6inqAk",
    }
    return mock_response

def test_vertex_ai_gemini_2_5_flash():
    """
    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
    """
    litellm.set_verbose = True
    load_vertex_ai_credentials()
    from litellm.llms.custom_httpx.http_handler import HTTPHandler

    client = HTTPHandler()

    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
        response = completion(
            model="vertex_ai/gemini-2.5-flash-preview-04-17",
            messages=[{"role": "user", "content": "Hello, world!"}],
            client=client,
        )
        mock_client.assert_called()
        assert response.usage is not None
        assert response.usage.completion_tokens_details.thinking_enabled is None

    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
        response = completion(
            model="vertex_ai/gemini-2.5-flash-preview-04-17",
            messages=[{"role": "user", "content": "Hello, world!"}],
            thinking={"type": "enabled", "budget_tokens": 1024},
            client=client,
        )
        mock_client.assert_called()
        assert response.usage is not None
        assert response.usage.completion_tokens_details.thinking_enabled is True

    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
    #     response = completion(
    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
    #         messages=[{"role": "user", "content": "Hello, world!"}],
    #         thinking={"type": "enabled", "budget_tokens": 0},
    #         client=client,
    #     )
    #     mock_client.assert_called()
    #     assert response.usage is not None
    #     assert response.usage.completion_tokens_details.thinking_enabled is False
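
One sanity check on the mocked usageMetadata above: promptTokenCount (4) plus candidatesTokenCount (18) plus thoughtsTokenCount (256) equals totalTokenCount (278), i.e. the thinking tokens are reported on top of the visible completion text, which is presumably why the billing path needs to know whether thinking was enabled. The arithmetic, spelled out:

# Arithmetic check of the mocked token accounting above.
prompt_tokens = 4        # promptTokenCount
candidate_tokens = 18    # candidatesTokenCount (visible completion text)
thoughts_tokens = 256    # thoughtsTokenCount (reasoning/thinking tokens)
assert prompt_tokens + candidate_tokens + thoughts_tokens == 278  # totalTokenCount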