Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-24 18:24:20 +00:00
fix(vertex_and_google_ai_studio_gemini.py): add check when thinking is disabled

Allows billing to work correctly. Fixes https://github.com/BerriAI/litellm/issues/10121
This commit is contained in:
parent 4a50cf10fb
commit 086981858b

3 changed files with 129 additions and 2 deletions
@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        is_thinking_enabled: Optional[
+            bool
+        ] = None,  # gemini-2.5-flash has thinking enabled by default
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             reasoning_tokens = completion_response["usageMetadata"][
                 "thoughtsTokenCount"
             ]
+
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cached_tokens,
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )

         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
         )

         return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):

         return grounding_metadata, safety_ratings, citation_metadata

+    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
+        """
+        Returns True if thinking is enabled for the model, False if it is
+        explicitly disabled, and None if no thinkingConfig was passed.
+        """
+        thinking_config = cast(
+            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
+        )
+
+        if thinking_config is None:
+            return None
+
+        thinking_budget = thinking_config.get("thinkingBudget")
+        if thinking_budget == 0:
+            return False
+
+        return True
+
     def transform_response(
         self,
         model: str,
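The helper's contract is deliberately tri-state, since "caller said nothing" must stay distinguishable from "caller turned thinking off". A standalone sketch of the same logic (hypothetical names, not litellm's actual module):

    from typing import Dict, Optional

    def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
        # None  -> caller sent no thinkingConfig, so the model default applies
        #          (gemini-2.5-flash has thinking on by default)
        # False -> thinkingBudget == 0 explicitly disables thinking
        # True  -> any other thinkingConfig enables it
        thinking_config = optional_params.get("thinkingConfig")
        if thinking_config is None:
            return None
        if thinking_config.get("thinkingBudget") == 0:
            return False
        return True

    assert is_thinking_enabled({}) is None
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True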
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             _candidates, model_response, litellm_params
         )

-        usage = self._calculate_usage(completion_response=completion_response)
+        thinking_enabled = self._is_thinking_enabled_function(optional_params)
+        usage = self._calculate_usage(
+            completion_response=completion_response,
+            is_thinking_enabled=thinking_enabled,
+        )
         setattr(model_response, "usage", usage)

         ## ADD METADATA TO RESPONSE ##
@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
     CompletionTokensDetails
 ):  # wrapper for older openai versions
     text_tokens: Optional[int] = None
+    thinking_enabled: Optional[
+        bool
+    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
     """Text tokens generated by the model."""
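Why surface the flag at all: gemini-2.5-flash prices thinking output differently from non-thinking output, so a cost callback needs to branch on it. A minimal sketch with illustrative prices (the constants and function name here are assumptions, not litellm's pricing API):

    from typing import Optional

    # Illustrative per-token output prices (USD); gemini-2.5-flash-preview
    # was announced at $0.60/1M non-thinking vs $3.50/1M thinking output.
    OUTPUT_PRICE_NON_THINKING = 0.60 / 1_000_000
    OUTPUT_PRICE_THINKING = 3.50 / 1_000_000

    def output_cost(completion_tokens: int, thinking_enabled: Optional[bool]) -> float:
        # None means the caller never set thinkingConfig; the model default
        # (thinking on) applies, so bill at the thinking rate.
        if thinking_enabled is False:
            return completion_tokens * OUTPUT_PRICE_NON_THINKING
        return completion_tokens * OUTPUT_PRICE_THINKING

    print(output_cost(274, None))   # default: thinking rate
    print(output_cost(18, False))   # budget_tokens=0: cheaper rate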
@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
             completion_tokens - reasoning_tokens if completion_tokens else None
         )
         completion_tokens_details = CompletionTokensDetailsWrapper(
-            reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
+            **{
+                "reasoning_tokens": reasoning_tokens,
+                "text_tokens": text_tokens,
+                **completion_tokens_details,
+            }
         )

         # Ensure completion_tokens_details is properly handled
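The switch from keyword arguments to a **{...} spread is about override order: keys spread later win, so values in the incoming completion_tokens_details dict (e.g. thinking_enabled from _calculate_usage) take precedence over the locally computed defaults. A standalone illustration of that dict-merge semantics:

    defaults = {"reasoning_tokens": 256, "text_tokens": 18}
    incoming = {"thinking_enabled": True}

    merged = {**defaults, **incoming}
    # -> {'reasoning_tokens': 256, 'text_tokens': 18, 'thinking_enabled': True}
    # If incoming repeated a key, its value would win, since it is spread last.
    print(merged)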
@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):

     mock_client.assert_called()
     assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")
+
+
+def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_response.headers = {"Content-Type": "application/json"}
+    mock_response.json.return_value = {
+        "candidates": [
+            {
+                "content": {
+                    "role": "model",
+                    "parts": [
+                        {
+                            "text": "Hello! It's nice to hear from you. How can I help you today?"
+                        }
+                    ],
+                },
+                "finishReason": "STOP",
+                "avgLogprobs": -6.8490977817111549,
+            }
+        ],
+        "usageMetadata": {
+            "promptTokenCount": 4,
+            "candidatesTokenCount": 18,
+            "totalTokenCount": 278,
+            "trafficType": "ON_DEMAND",
+            "promptTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 4,
+                }
+            ],
+            "candidatesTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 18,
+                }
+            ],
+            "thoughtsTokenCount": 256,
+        },
+        "modelVersion": "gemini-2.5-flash-preview-04-17",
+        "createTime": "2025-04-22T03:22:20.094867Z",
+        "responseId": "bAsHaJPlBcCWm9IP_6inqAk",
+    }
+    return mock_response
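The usage numbers in the mock are chosen to exercise the billing path: Gemini reports thoughtsTokenCount outside candidatesTokenCount but inside totalTokenCount, so 4 + 18 + 256 = 278. A quick check of that arithmetic:

    usage_metadata = {
        "promptTokenCount": 4,
        "candidatesTokenCount": 18,
        "thoughtsTokenCount": 256,
        "totalTokenCount": 278,
    }
    assert (
        usage_metadata["promptTokenCount"]
        + usage_metadata["candidatesTokenCount"]
        + usage_metadata["thoughtsTokenCount"]
    ) == usage_metadata["totalTokenCount"]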
+
+
+def test_vertex_ai_gemini_2_5_flash():
+    """
+    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
+    """
+    litellm.set_verbose = True
+    load_vertex_ai_credentials()
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is None
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            thinking={"type": "enabled", "budget_tokens": 1024},
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is True
+
+    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+    #     response = completion(
+    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
+    #         messages=[{"role": "user", "content": "Hello, world!"}],
+    #         thinking={"type": "enabled", "budget_tokens": 0},
+    #         client=client,
+    #     )
+
+    #     mock_client.assert_called()
+    #     assert response.usage is not None
+    #     assert response.usage.completion_tokens_details.thinking_enabled is False
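Note that the test drives the feature through the portable thinking parameter rather than raw thinkingConfig; litellm translates the Anthropic-style dict into Gemini's native field before _is_thinking_enabled_function ever sees it. A simplified sketch of that mapping (names illustrative, not litellm's actual transform):

    def map_thinking_param(thinking: dict) -> dict:
        # {"type": "enabled", "budget_tokens": N} -> {"thinkingConfig": {"thinkingBudget": N}}
        thinking_config = {}
        if thinking.get("type") == "enabled":
            thinking_config["thinkingBudget"] = thinking.get("budget_tokens", 0)
        return {"thinkingConfig": thinking_config}

    assert map_thinking_param({"type": "enabled", "budget_tokens": 1024}) == {
        "thinkingConfig": {"thinkingBudget": 1024}
    }
    # budget_tokens=0 maps to thinkingBudget=0, which the helper reads as disabled
    assert map_thinking_param({"type": "enabled", "budget_tokens": 0}) == {
        "thinkingConfig": {"thinkingBudget": 0}
    }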