From a7db0df0434bfbac2b68ebe1c343b77955becb4b Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Mon, 21 Apr 2025 22:48:00 -0700
Subject: [PATCH] Gemini-2.5-flash improvements (#10198)

* fix(vertex_and_google_ai_studio_gemini.py): allow thinking budget = 0

Fixes https://github.com/BerriAI/litellm/issues/10121

* fix(vertex_and_google_ai_studio_gemini.py): handle nuance in counting exclusive vs. inclusive tokens

Addresses https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
---
 .../vertex_and_google_ai_studio_gemini.py     | 35 ++++++++++--
 ...test_vertex_and_google_ai_studio_gemini.py | 53 ++++++++++++++++++-
 tests/llm_translation/test_gemini.py          | 20 ++++++-
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
index d4c74f4910..9ea1c2ee12 100644
--- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -57,6 +57,7 @@ from litellm.types.llms.vertex_ai import (
     LogprobsResult,
     ToolConfig,
     Tools,
+    UsageMetadata,
 )
 from litellm.types.utils import (
     ChatCompletionTokenLogprob,
@@ -390,7 +391,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
         params: GeminiThinkingConfig = {}
         if thinking_enabled:
             params["includeThoughts"] = True
-        if thinking_budget:
+        if thinking_budget is not None and isinstance(thinking_budget, int):
             params["thinkingBudget"] = thinking_budget
 
         return params
@@ -740,6 +741,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
 
         return model_response
 
+    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+        """
+        Check if the candidate token count is inclusive of the thinking token count
+
+        if promptTokenCount + candidatesTokenCount == totalTokenCount, then the candidate token count is inclusive of the thinking token count
+
+        else the candidate token count is exclusive of the thinking token count
+
+        Addresses - https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
+        """
+        if usage_metadata.get("promptTokenCount", 0) + usage_metadata.get(
+            "candidatesTokenCount", 0
+        ) == usage_metadata.get("totalTokenCount", 0):
+            return True
+        else:
+            return False
+
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
@@ -768,14 +786,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if (
+            not self.is_candidate_token_count_inclusive(
+                completion_response["usageMetadata"]
+            )
+            and reasoning_tokens
+        ):
+            completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
diff --git a/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py b/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py
index 0c6a95a97b..4b1c085bb4 100644
--- a/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py
+++ b/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py
@@ -10,7 +10,8 @@ from litellm import ModelResponse
 from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexGeminiConfig,
 )
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.llms.vertex_ai import UsageMetadata
+from litellm.types.utils import ChoiceLogprobs, Usage
 
 
 def test_top_logprobs():
@@ -259,3 +260,53 @@ def test_vertex_ai_empty_content():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content is None
     assert reasoning_content is None
+
+
+@pytest.mark.parametrize(
+    "usage_metadata, inclusive, expected_usage",
+    [
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=10,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            True,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=5,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            False,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+    ],
+)
+def test_vertex_ai_candidate_token_count_inclusive(
+    usage_metadata, inclusive, expected_usage
+):
+    """
+    Test whether the candidate token count is inclusive of the thinking token count, and that usage is calculated correctly in both cases
+    """
+    v = VertexGeminiConfig()
+    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+
+    usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
+    assert usage.prompt_tokens == expected_usage.prompt_tokens
+    assert usage.completion_tokens == expected_usage.completion_tokens
+    assert usage.total_tokens == expected_usage.total_tokens
diff --git a/tests/llm_translation/test_gemini.py b/tests/llm_translation/test_gemini.py
index 35aa22722e..475c4f03b7 100644
--- a/tests/llm_translation/test_gemini.py
+++ b/tests/llm_translation/test_gemini.py
@@ -116,4 +116,22 @@ def test_gemini_thinking():
         messages=messages,  # make sure call works
     )
     print(response.choices[0].message)
-    assert response.choices[0].message.content is not None
\ No newline at end of file
+    assert response.choices[0].message.content is not None
+
+
+def test_gemini_thinking_budget_0():
+    litellm._turn_on_debug()
+    from litellm.types.utils import Message, CallTypes
+    from litellm.utils import return_raw_request
+    import json
+
+    raw_request = return_raw_request(
+        endpoint=CallTypes.completion,
+        kwargs={
+            "model": "gemini/gemini-2.5-flash-preview-04-17",
+            "messages": [{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
+            "thinking": {"type": "enabled", "budget_tokens": 0}
+        }
+    )
+    print(raw_request)
+    assert "0" in json.dumps(raw_request["raw_request_body"])
\ No newline at end of file
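---

Two notes on the changes above. First, the thinking-budget change fixes a truthiness bug: `if thinking_budget:` evaluates false for 0, so a caller who explicitly requested a zero thinking budget had the thinkingBudget field silently dropped from the Gemini request. A minimal standalone sketch of the before/after behavior; old_mapping and new_mapping are hypothetical names for illustration, not litellm functions:

    from typing import Optional

    def old_mapping(thinking_budget: Optional[int]) -> dict:
        params = {}
        if thinking_budget:  # 0 is falsy, so an explicit zero budget was dropped
            params["thinkingBudget"] = thinking_budget
        return params

    def new_mapping(thinking_budget: Optional[int]) -> dict:
        params = {}
        if thinking_budget is not None and isinstance(thinking_budget, int):
            params["thinkingBudget"] = thinking_budget
        return params

    assert old_mapping(0) == {}                      # bug: field omitted
    assert new_mapping(0) == {"thinkingBudget": 0}   # fix: field forwarded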
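Second, the usage change normalizes the two shapes of usageMetadata that Gemini can return: when promptTokenCount + candidatesTokenCount == totalTokenCount, candidatesTokenCount already includes thoughtsTokenCount; otherwise the thinking tokens sit outside it and must be added back so completion_tokens means the same thing either way. A standalone sketch of that arithmetic, using plain dicts in place of litellm's UsageMetadata TypedDict (the values mirror the two parametrized test cases):

    def is_inclusive(md: dict) -> bool:
        # Thoughts are folded into candidates iff prompt + candidates == total.
        return md.get("promptTokenCount", 0) + md.get(
            "candidatesTokenCount", 0
        ) == md.get("totalTokenCount", 0)

    def completion_tokens(md: dict) -> int:
        tokens = md.get("candidatesTokenCount", 0)
        thoughts = md.get("thoughtsTokenCount", 0)
        if not is_inclusive(md) and thoughts:
            tokens += thoughts  # exclusive shape: add reasoning tokens back in
        return tokens

    inclusive = {"promptTokenCount": 10, "candidatesTokenCount": 10,
                 "totalTokenCount": 20, "thoughtsTokenCount": 5}
    exclusive = {"promptTokenCount": 10, "candidatesTokenCount": 5,
                 "totalTokenCount": 20, "thoughtsTokenCount": 5}
    assert completion_tokens(inclusive) == 10
    assert completion_tokens(exclusive) == 10  # both normalize to the same count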