Gemini-2.5-flash improvements (#10198)

* fix(vertex_and_google_ai_studio_gemini.py): allow thinking budget = 0

Fixes https://github.com/BerriAI/litellm/issues/10121

* fix(vertex_and_google_ai_studio_gemini.py): handle nuance in counting exclusive vs. inclusive tokens

Addresses https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
Krish Dholakia 2025-04-21 22:48:00 -07:00 committed by GitHub
parent d1fb051d25
commit a7db0df043
3 changed files with 102 additions and 6 deletions


@@ -57,6 +57,7 @@ from litellm.types.llms.vertex_ai import (
     LogprobsResult,
     ToolConfig,
     Tools,
+    UsageMetadata,
 )
 from litellm.types.utils import (
     ChatCompletionTokenLogprob,
@@ -390,7 +391,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
         params: GeminiThinkingConfig = {}
         if thinking_enabled:
             params["includeThoughts"] = True
-        if thinking_budget:
+        if thinking_budget is not None and isinstance(thinking_budget, int):
             params["thinkingBudget"] = thinking_budget
         return params
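For context, a minimal standalone sketch (not part of the commit) of the truthiness bug this hunk fixes, with hypothetical helper names: "if thinking_budget:" is false for 0, so an explicitly disabled thinking budget was silently dropped, while the new "is not None" check lets 0 through.

from typing import Dict, Optional

def map_thinking_budget_old(thinking_budget: Optional[int]) -> Dict[str, int]:
    params: Dict[str, int] = {}
    if thinking_budget:  # bug: 0 is falsy, so an explicit budget of 0 never reached the request
        params["thinkingBudget"] = thinking_budget
    return params

def map_thinking_budget_new(thinking_budget: Optional[int]) -> Dict[str, int]:
    params: Dict[str, int] = {}
    if thinking_budget is not None and isinstance(thinking_budget, int):
        params["thinkingBudget"] = thinking_budget  # a budget of 0 now survives the mapping
    return params

assert map_thinking_budget_old(0) == {}                          # old behavior: budget lost
assert map_thinking_budget_new(0) == {"thinkingBudget": 0}       # fixed behavior
assert map_thinking_budget_new(None) == {}                       # no budget requested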
@@ -740,6 +741,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
         return model_response

+    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+        """
+        Check if the candidate token count is inclusive of the thinking token count.
+
+        If promptTokenCount + candidatesTokenCount == totalTokenCount, the candidate
+        token count is inclusive of the thinking token count; otherwise it is exclusive.
+
+        Addresses - https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
+        """
+        if usage_metadata.get("promptTokenCount", 0) + usage_metadata.get(
+            "candidatesTokenCount", 0
+        ) == usage_metadata.get("totalTokenCount", 0):
+            return True
+        else:
+            return False
+
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
@@ -768,14 +786,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if (
+            not self.is_candidate_token_count_inclusive(
+                completion_response["usageMetadata"]
+            )
+            and reasoning_tokens
+        ):
+            completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
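A minimal standalone sketch of the accounting rule the two hunks above implement, using plain dicts instead of litellm's UsageMetadata type (the helper name is hypothetical): when promptTokenCount + candidatesTokenCount equals totalTokenCount, the candidate count already includes thinking tokens; otherwise the thinking tokens are added back into completion_tokens.

def completion_tokens_from_usage(usage_metadata: dict) -> int:
    # Mirrors the inclusive vs. exclusive logic; plain dicts, not litellm types.
    prompt = usage_metadata.get("promptTokenCount", 0)
    candidates = usage_metadata.get("candidatesTokenCount", 0)
    thoughts = usage_metadata.get("thoughtsTokenCount", 0)
    inclusive = prompt + candidates == usage_metadata.get("totalTokenCount", 0)
    # Exclusive metadata reports thinking tokens separately, so fold them back in.
    return candidates if inclusive else candidates + thoughts

# Matches the two parametrized cases in the new test below:
assert completion_tokens_from_usage(
    {"promptTokenCount": 10, "candidatesTokenCount": 10, "totalTokenCount": 20, "thoughtsTokenCount": 5}
) == 10  # inclusive: candidates already contain the 5 thinking tokens
assert completion_tokens_from_usage(
    {"promptTokenCount": 10, "candidatesTokenCount": 5, "totalTokenCount": 20, "thoughtsTokenCount": 5}
) == 10  # exclusive: 5 candidates + 5 thinking tokens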


@@ -10,7 +10,8 @@ from litellm import ModelResponse
 from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexGeminiConfig,
 )
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.llms.vertex_ai import UsageMetadata
+from litellm.types.utils import ChoiceLogprobs, Usage


 def test_top_logprobs():
@@ -259,3 +260,53 @@ def test_vertex_ai_empty_content():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content is None
     assert reasoning_content is None
+
+
+@pytest.mark.parametrize(
+    "usage_metadata, inclusive, expected_usage",
+    [
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=10,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            True,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=5,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            False,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+    ],
+)
+def test_vertex_ai_candidate_token_count_inclusive(
+    usage_metadata, inclusive, expected_usage
+):
+    """
+    Test usage calculation when the candidate token count is inclusive vs. exclusive of the thinking token count
+    """
+    v = VertexGeminiConfig()
+    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+    usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
+    assert usage.prompt_tokens == expected_usage.prompt_tokens
+    assert usage.completion_tokens == expected_usage.completion_tokens
+    assert usage.total_tokens == expected_usage.total_tokens


@@ -116,4 +116,22 @@ def test_gemini_thinking():
         messages=messages,  # make sure call works
     )
     print(response.choices[0].message)
-    assert response.choices[0].message.content is not None
+    assert response.choices[0].message.content is not None
+
+
+def test_gemini_thinking_budget_0():
+    litellm._turn_on_debug()
+    from litellm.types.utils import Message, CallTypes
+    from litellm.utils import return_raw_request
+    import json
+
+    raw_request = return_raw_request(
+        endpoint=CallTypes.completion,
+        kwargs={
+            "model": "gemini/gemini-2.5-flash-preview-04-17",
+            "messages": [{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
+            "thinking": {"type": "enabled", "budget_tokens": 0},
+        },
+    )
+    print(raw_request)
+    assert "0" in json.dumps(raw_request["raw_request_body"])