From 086981858bbade1201ce5a186b2b1209d1913842 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Mon, 21 Apr 2025 20:40:25 -0700
Subject: [PATCH] fix(vertex_and_google_ai_Studio_gemini.py): add check when
 thinking disabled allows billing to work correctly

Fixes https://github.com/BerriAI/litellm/issues/10121
---
 .../vertex_and_google_ai_studio_gemini.py |  29 +++++-
 litellm/types/utils.py                    |   9 +-
 .../test_amazing_vertex_completion.py     |  93 +++++++++++++++++++
 3 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
index d4c74f4910..3549614422 100644
--- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        is_thinking_enabled: Optional[
+            bool
+        ] = None,  # gemini-2.5-flash has thinking enabled by default
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             reasoning_tokens = completion_response["usageMetadata"][
                 "thoughtsTokenCount"
             ]
+
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cached_tokens,
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
         )
 
         return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
 
         return grounding_metadata, safety_ratings, citation_metadata
 
+    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
+        """
+        Returns true if thinking is enabled for the model
+        """
+        thinking_config = cast(
+            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
+        )
+
+        if thinking_config is None:
+            return None
+
+        thinking_budget = thinking_config.get("thinkingBudget")
+        if thinking_budget == 0:
+            return False
+
+        return True
+
     def transform_response(
         self,
         model: str,
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
                 _candidates, model_response, litellm_params
             )
 
-        usage = self._calculate_usage(completion_response=completion_response)
+        thinking_enabled = self._is_thinking_enabled_function(optional_params)
+        usage = self._calculate_usage(
+            completion_response=completion_response,
+            is_thinking_enabled=thinking_enabled,
+        )
         setattr(model_response, "usage", usage)
 
         ## ADD METADATA TO RESPONSE ##
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 533ffaa64a..65fc63e917 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
     CompletionTokensDetails
 ):  # wrapper for older openai versions
     text_tokens: Optional[int] = None
+    thinking_enabled: Optional[
+        bool
+    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
     """Text tokens generated by the model."""
 
 
@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
                 completion_tokens - reasoning_tokens if completion_tokens else None
             )
             completion_tokens_details = CompletionTokensDetailsWrapper(
-                reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
+                **{
+                    "reasoning_tokens": reasoning_tokens,
+                    "text_tokens": text_tokens,
+                    **completion_tokens_details,
+                }
             )
 
         # Ensure completion_tokens_details is properly handled
diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py
index ec9b676772..40f98484ae 100644
--- a/tests/local_testing/test_amazing_vertex_completion.py
+++ b/tests/local_testing/test_amazing_vertex_completion.py
@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):
 
     mock_client.assert_called()
     assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")
+
+
+def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_response.headers = {"Content-Type": "application/json"}
+    mock_response.json.return_value = {
+        "candidates": [
+            {
+                "content": {
+                    "role": "model",
+                    "parts": [
+                        {
+                            "text": "Hello! It's nice to hear from you. How can I help you today?"
+                        }
+                    ]
+                },
+                "finishReason": "STOP",
+                "avgLogprobs": -6.8490977817111549
+            }
+        ],
+        "usageMetadata": {
+            "promptTokenCount": 4,
+            "candidatesTokenCount": 18,
+            "totalTokenCount": 278,
+            "trafficType": "ON_DEMAND",
+            "promptTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 4
+                }
+            ],
+            "candidatesTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 18
+                }
+            ],
+            "thoughtsTokenCount": 256
+        },
+        "modelVersion": "gemini-2.5-flash-preview-04-17",
+        "createTime": "2025-04-22T03:22:20.094867Z",
+        "responseId": "bAsHaJPlBcCWm9IP_6inqAk"
+    }
+    return mock_response
+
+
+
+def test_vertex_ai_gemini_2_5_flash():
+    """
+    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
+    """
+    litellm.set_verbose = True
+    load_vertex_ai_credentials()
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+    client = HTTPHandler()
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is None
+
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            thinking={"type": "enabled", "budget_tokens": 1024},
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is True
+
+
+    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+    #     response = completion(
+    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
+    #         messages=[{"role": "user", "content": "Hello, world!"}],
+    #         thinking={"type": "enabled", "budget_tokens": 0},
+    #         client=client,
+    #     )
+
+    #     mock_client.assert_called()
+    #     assert response.usage is not None
+    #     assert response.usage.completion_tokens_details.thinking_enabled is False
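
Note for reviewers (not part of the patch): a minimal sketch of the tri-state flag this change records. The standalone helper below simply mirrors the patch's _is_thinking_enabled_function without LiteLLM's cast/GeminiThinkingConfig imports; the function name is illustrative only. The distinction matters for billing because gemini-2.5-flash thinks by default, so "no explicit config" (None) is not the same as "explicitly disabled" (False).

    # Illustrative sketch -- mirrors _is_thinking_enabled_function from this patch.
    from typing import Dict, Optional

    def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
        thinking_config = optional_params.get("thinkingConfig")
        if thinking_config is None:
            return None  # no explicit config; gemini-2.5-flash thinks by default
        if thinking_config.get("thinkingBudget") == 0:
            return False  # thinking explicitly disabled via a zero budget
        return True  # explicit non-zero budget

    assert is_thinking_enabled({}) is None
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False

The resulting value is passed into _calculate_usage and surfaces as usage.completion_tokens_details.thinking_enabled via the new CompletionTokensDetailsWrapper field, which is what the tests above assert for the default and budget_tokens=1024 cases.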