Gemini-2.5-flash improvements (#10198)
* fix(vertex_and_google_ai_studio_gemini.py): allow thinking budget = 0
  Fixes https://github.com/BerriAI/litellm/issues/10121
* fix(vertex_and_google_ai_studio_gemini.py): handle nuance in counting exclusive vs. inclusive tokens
  Addresses https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
Parent: bdfb6c5a76
Commit: 6cd8330fc5
3 changed files with 102 additions and 6 deletions
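For context, the first fix means a caller can explicitly request a zero thinking budget. Based on the new test added at the end of this commit, such a call looks roughly like the sketch below (the model name and the shape of the `thinking` parameter are taken from that test; the surrounding call is illustrative):

import litellm

# Request Gemini 2.5 Flash with extended thinking disabled via a zero budget.
# After this fix, the 0 is forwarded to the provider as `thinkingBudget: 0`
# instead of being dropped from the request.
response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
    thinking={"type": "enabled", "budget_tokens": 0},
)
print(response.choices[0].message.content)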
@@ -57,6 +57,7 @@ from litellm.types.llms.vertex_ai import (
     LogprobsResult,
     ToolConfig,
     Tools,
+    UsageMetadata,
 )
 from litellm.types.utils import (
     ChatCompletionTokenLogprob,
@@ -390,7 +391,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
         params: GeminiThinkingConfig = {}
         if thinking_enabled:
             params["includeThoughts"] = True
-        if thinking_budget:
+        if thinking_budget is not None and isinstance(thinking_budget, int):
             params["thinkingBudget"] = thinking_budget

         return params
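Why the guard changed: 0 is falsy in Python, so the old `if thinking_budget:` check silently discarded an explicit budget of 0 (the case reported in issue 10121). A minimal illustration:

thinking_budget = 0

if thinking_budget:
    print("old check: budget forwarded")   # never runs for a 0 budget

if thinking_budget is not None and isinstance(thinking_budget, int):
    print("new check: budget forwarded")   # runs, so thinkingBudget=0 reaches the request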
@@ -740,6 +741,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):

         return model_response

+    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+        """
+        Check if the candidate token count is inclusive of the thinking token count
+
+        if prompttokencount + candidatesTokenCount == totalTokenCount, then the candidate token count is inclusive of the thinking token count
+
+        else the candidate token count is exclusive of the thinking token count
+
+        Addresses - https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
+        """
+        if usage_metadata.get("promptTokenCount", 0) + usage_metadata.get(
+            "candidatesTokenCount", 0
+        ) == usage_metadata.get("totalTokenCount", 0):
+            return True
+        else:
+            return False
+
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
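The new helper relies only on arithmetic over the usage metadata. Using the same numbers as the parametrized test added further down: with promptTokenCount=10, candidatesTokenCount=10, totalTokenCount=20 the sums match (10 + 10 == 20), so the candidate count already includes the thinking tokens (inclusive); with candidatesTokenCount=5 and the same prompt and total counts, 10 + 5 != 20, so the 5 thoughtsTokenCount tokens are reported separately (exclusive).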
@@ -768,14 +786,23 @@
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if (
+            not self.is_candidate_token_count_inclusive(
+                completion_response["usageMetadata"]
+            )
+            and reasoning_tokens
+        ):
+            completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
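Putting the two changes together in `_calculate_usage`: when the metadata is exclusive, the thinking tokens are folded back into the completion count, so for the exclusive example above completion_tokens becomes 5 (candidates) + 5 (reasoning) = 10, matching the inclusive case, while total_tokens stays at the reported totalTokenCount of 20. The remaining hunks add the corresponding tests.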
@@ -10,7 +10,8 @@ from litellm import ModelResponse
 from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexGeminiConfig,
 )
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.llms.vertex_ai import UsageMetadata
+from litellm.types.utils import ChoiceLogprobs, Usage


 def test_top_logprobs():
@@ -259,3 +260,53 @@ def test_vertex_ai_empty_content():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content is None
     assert reasoning_content is None
+
+
+@pytest.mark.parametrize(
+    "usage_metadata, inclusive, expected_usage",
+    [
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=10,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            True,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=5,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            False,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+    ],
+)
+def test_vertex_ai_candidate_token_count_inclusive(
+    usage_metadata, inclusive, expected_usage
+):
+    """
+    Test that the candidate token count is inclusive of the thinking token count
+    """
+    v = VertexGeminiConfig()
+    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+
+    usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
+    assert usage.prompt_tokens == expected_usage.prompt_tokens
+    assert usage.completion_tokens == expected_usage.completion_tokens
+    assert usage.total_tokens == expected_usage.total_tokens
@@ -116,4 +116,22 @@ def test_gemini_thinking():
         messages=messages, # make sure call works
     )
     print(response.choices[0].message)
    assert response.choices[0].message.content is not None
+
+
+def test_gemini_thinking_budget_0():
+    litellm._turn_on_debug()
+    from litellm.types.utils import Message, CallTypes
+    from litellm.utils import return_raw_request
+    import json
+
+    raw_request = return_raw_request(
+        endpoint=CallTypes.completion,
+        kwargs={
+            "model": "gemini/gemini-2.5-flash-preview-04-17",
+            "messages": [{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
+            "thinking": {"type": "enabled", "budget_tokens": 0}
+        }
+    )
+    print(raw_request)
+    assert "0" in json.dumps(raw_request["raw_request_body"])
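Given the parameter mapping in the second hunk, a request with `thinking={"type": "enabled", "budget_tokens": 0}` should produce a thinking config along the lines of the sketch below. The exact nesting of this dict inside the raw Gemini request body is an assumption here, which is also why the test only asserts that a "0" appears in the serialized request.

# Hypothetical shape of the mapped thinking config (keys come from GeminiThinkingConfig in the diff above):
thinking_config = {
    "includeThoughts": True,   # thinking is enabled
    "thinkingBudget": 0,       # the zero budget is now preserved
}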