fix(vertex_and_google_ai_studio_gemini.py): add check when thinking disabled

allows billing to be calculated correctly for models like gemini-2.5-flash, which have thinking enabled by default

Fixes https://github.com/BerriAI/litellm/issues/10121
Krrish Dholakia 2025-04-21 20:40:25 -07:00
parent 4a50cf10fb
commit 086981858b
3 changed files with 129 additions and 2 deletions
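
For context, a rough caller-side sketch of what the new flag is meant to surface, based on the tests added below (the model name and the thinking parameter are taken from those tests; Vertex credential setup is omitted). This is illustrative only and not part of the commit:

import litellm

# No thinkingConfig sent: the flag stays None (model default behavior is used).
resp = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Hello, world!"}],
)
print(resp.usage.completion_tokens_details.thinking_enabled)  # None

# Thinking explicitly enabled with a budget: the flag becomes True,
# which downstream cost tracking can take into account.
resp = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Hello, world!"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)
print(resp.usage.completion_tokens_details.thinking_enabled)  # True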


@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
    def _calculate_usage(
        self,
        completion_response: GenerateContentResponseBody,
        is_thinking_enabled: Optional[
            bool
        ] = None,  # gemini-2.5-flash has thinking enabled by default
    ) -> Usage:
        cached_tokens: Optional[int] = None
        audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            reasoning_tokens = completion_response["usageMetadata"][
                "thoughtsTokenCount"
            ]

        prompt_tokens_details = PromptTokensDetailsWrapper(
            cached_tokens=cached_tokens,
            audio_tokens=audio_tokens,
            text_tokens=text_tokens,
        )

        ## GET USAGE ##
        usage = Usage(
            prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
            prompt_tokens_details=prompt_tokens_details,
            reasoning_tokens=reasoning_tokens,
            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
        )

        return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
        return grounding_metadata, safety_ratings, citation_metadata

    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
        """
        Returns True if thinking is enabled, False if it is explicitly disabled,
        and None if no thinkingConfig was provided.
        """
        thinking_config = cast(
            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
        )
        if thinking_config is None:
            return None
        thinking_budget = thinking_config.get("thinkingBudget")
        if thinking_budget == 0:
            return False
        return True

    def transform_response(
        self,
        model: str,
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            _candidates, model_response, litellm_params
        )

        usage = self._calculate_usage(completion_response=completion_response)
        thinking_enabled = self._is_thinking_enabled_function(optional_params)
        usage = self._calculate_usage(
            completion_response=completion_response,
            is_thinking_enabled=thinking_enabled,
        )
        setattr(model_response, "usage", usage)

        ## ADD METADATA TO RESPONSE ##
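
The helper added above yields a three-way result: None when the caller sent no thinkingConfig at all, False when an explicit thinkingBudget of 0 disables thinking, and True otherwise. A minimal standalone sketch of that mapping, with plain dicts standing in for the GeminiThinkingConfig type and the cast omitted:

from typing import Dict, Optional


def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
    # Simplified mirror of _is_thinking_enabled_function above.
    thinking_config = optional_params.get("thinkingConfig", None)
    if thinking_config is None:
        return None  # nothing was requested, leave the flag unset
    if thinking_config.get("thinkingBudget") == 0:
        return False  # an explicit zero budget disables thinking
    return True  # any other config counts as thinking enabled


assert is_thinking_enabled({}) is None
assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False
assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True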


@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
    CompletionTokensDetails
):  # wrapper for older openai versions
    text_tokens: Optional[int] = None
    thinking_enabled: Optional[
        bool
    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
    """Text tokens generated by the model."""
@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
                completion_tokens - reasoning_tokens if completion_tokens else None
            )
            completion_tokens_details = CompletionTokensDetailsWrapper(
                reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
                **{
                    "reasoning_tokens": reasoning_tokens,
                    "text_tokens": text_tokens,
                    **completion_tokens_details,
                }
            )

        # Ensure completion_tokens_details is properly handled
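
The dict-unpacking change above is what lets _calculate_usage pass completion_tokens_details as a plain dict: the reasoning and text token fields are merged with whatever extra keys the caller supplied (here thinking_enabled), and keys listed later win. A quick illustration of the merge semantics with illustrative values (the real code builds a CompletionTokensDetailsWrapper from the merged dict):

reasoning_tokens = 256
text_tokens = 18
completion_tokens_details = {"thinking_enabled": True}  # as passed in by _calculate_usage

merged = {
    "reasoning_tokens": reasoning_tokens,
    "text_tokens": text_tokens,
    **completion_tokens_details,  # caller-supplied keys are applied last
}
print(merged)
# {'reasoning_tokens': 256, 'text_tokens': 18, 'thinking_enabled': True}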


@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):
    mock_client.assert_called()
    assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")


def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
    mock_response = MagicMock()
    mock_response.status_code = 200
    mock_response.headers = {"Content-Type": "application/json"}
    mock_response.json.return_value = {
        "candidates": [
            {
                "content": {
                    "role": "model",
                    "parts": [
                        {
                            "text": "Hello! It's nice to hear from you. How can I help you today?"
                        }
                    ],
                },
                "finishReason": "STOP",
                "avgLogprobs": -6.8490977817111549,
            }
        ],
        "usageMetadata": {
            "promptTokenCount": 4,
            "candidatesTokenCount": 18,
            "totalTokenCount": 278,
            "trafficType": "ON_DEMAND",
            "promptTokensDetails": [
                {
                    "modality": "TEXT",
                    "tokenCount": 4
                }
            ],
            "candidatesTokensDetails": [
                {
                    "modality": "TEXT",
                    "tokenCount": 18
                }
            ],
            "thoughtsTokenCount": 256,
        },
        "modelVersion": "gemini-2.5-flash-preview-04-17",
        "createTime": "2025-04-22T03:22:20.094867Z",
        "responseId": "bAsHaJPlBcCWm9IP_6inqAk",
    }
    return mock_response

def test_vertex_ai_gemini_2_5_flash():
    """
    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
    """
    litellm.set_verbose = True
    load_vertex_ai_credentials()
    from litellm.llms.custom_httpx.http_handler import HTTPHandler

    client = HTTPHandler()

    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
        response = completion(
            model="vertex_ai/gemini-2.5-flash-preview-04-17",
            messages=[{"role": "user", "content": "Hello, world!"}],
            client=client,
        )
        mock_client.assert_called()
        assert response.usage is not None
        assert response.usage.completion_tokens_details.thinking_enabled is None

    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
        response = completion(
            model="vertex_ai/gemini-2.5-flash-preview-04-17",
            messages=[{"role": "user", "content": "Hello, world!"}],
            thinking={"type": "enabled", "budget_tokens": 1024},
            client=client,
        )
        mock_client.assert_called()
        assert response.usage is not None
        assert response.usage.completion_tokens_details.thinking_enabled is True

    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
    #     response = completion(
    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
    #         messages=[{"role": "user", "content": "Hello, world!"}],
    #         thinking={"type": "enabled", "budget_tokens": 0},
    #         client=client,
    #     )
    #     mock_client.assert_called()
    #     assert response.usage is not None
    #     assert response.usage.completion_tokens_details.thinking_enabled is False
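
One sanity check on the mocked usageMetadata above: promptTokenCount (4) plus candidatesTokenCount (18) plus thoughtsTokenCount (256) equals totalTokenCount (278), i.e. the thinking tokens are reported on top of the visible completion text, which is presumably why the billing path needs to know whether thinking was enabled. The arithmetic, spelled out:

# Arithmetic check of the mocked token accounting above.
prompt_tokens = 4        # promptTokenCount
candidate_tokens = 18    # candidatesTokenCount (visible completion text)
thoughts_tokens = 256    # thoughtsTokenCount (reasoning/thinking tokens)
assert prompt_tokens + candidate_tokens + thoughts_tokens == 278  # totalTokenCount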