Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)
fix(vertex_and_google_ai_studio_gemini.py): add check for when thinking is disabled

Allows billing to work correctly. Fixes https://github.com/BerriAI/litellm/issues/10121
Parent: 4a50cf10fb
Commit: 086981858b

3 changed files with 129 additions and 2 deletions
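In caller terms, the commit surfaces a thinking_enabled flag on usage.completion_tokens_details so downstream cost tracking can tell whether Gemini's thinking mode was on. Below is a minimal sketch of that behavior, mirroring the test added at the end of this diff; the model name and parameters come from the diff itself, and a real call additionally needs Vertex AI credentials.

# Illustrative sketch only, based on the test added in this commit.
import litellm

# No `thinking` argument: no thinkingConfig is sent, so the flag stays None
# (per the diff, gemini-2.5-flash has thinking enabled by default on the API side).
response = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Hello, world!"}],
)
print(response.usage.completion_tokens_details.thinking_enabled)  # None

# Thinking explicitly enabled with a budget: the flag is True, so cost
# calculation can account for thinking output tokens (per the commit message).
response = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Hello, world!"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)
print(response.usage.completion_tokens_details.thinking_enabled)  # True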
@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
    def _calculate_usage(
        self,
        completion_response: GenerateContentResponseBody,
+        is_thinking_enabled: Optional[
+            bool
+        ] = None,  # gemini-2.5-flash has thinking enabled by default
    ) -> Usage:
        cached_tokens: Optional[int] = None
        audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            reasoning_tokens = completion_response["usageMetadata"][
                "thoughtsTokenCount"
            ]

        prompt_tokens_details = PromptTokensDetailsWrapper(
            cached_tokens=cached_tokens,
            audio_tokens=audio_tokens,
            text_tokens=text_tokens,
        )

        ## GET USAGE ##
        usage = Usage(
            prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
            prompt_tokens_details=prompt_tokens_details,
            reasoning_tokens=reasoning_tokens,
+            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
        )

        return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):

        return grounding_metadata, safety_ratings, citation_metadata

+    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
+        """
+        Returns true if thinking is enabled for the model
+        """
+        thinking_config = cast(
+            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
+        )
+
+        if thinking_config is None:
+            return None
+
+        thinking_budget = thinking_config.get("thinkingBudget")
+        if thinking_budget == 0:
+            return False
+
+        return True
+
    def transform_response(
        self,
        model: str,
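To make the helper's tri-state contract explicit, here is a self-contained restatement of its decision logic. This is an illustration only, not the library code itself; the real method lives on VertexGeminiConfig as shown in the hunk above.

from typing import Dict, Optional


def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
    # Mirrors _is_thinking_enabled_function above, for illustration only.
    thinking_config = optional_params.get("thinkingConfig")
    if thinking_config is None:
        return None  # no thinkingConfig sent -> unknown / model default
    if thinking_config.get("thinkingBudget") == 0:
        return False  # explicit zero budget -> thinking disabled
    return True  # any other thinkingConfig -> thinking enabled


# The three cases the new code distinguishes:
assert is_thinking_enabled({}) is None
assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False
assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True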
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
            _candidates, model_response, litellm_params
        )

-        usage = self._calculate_usage(completion_response=completion_response)
+        thinking_enabled = self._is_thinking_enabled_function(optional_params)
+        usage = self._calculate_usage(
+            completion_response=completion_response,
+            is_thinking_enabled=thinking_enabled,
+        )
        setattr(model_response, "usage", usage)

        ## ADD METADATA TO RESPONSE ##
@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
    CompletionTokensDetails
):  # wrapper for older openai versions
    text_tokens: Optional[int] = None
+    thinking_enabled: Optional[
+        bool
+    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
    """Text tokens generated by the model."""

@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
                completion_tokens - reasoning_tokens if completion_tokens else None
            )
            completion_tokens_details = CompletionTokensDetailsWrapper(
-                reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
+                **{
+                    "reasoning_tokens": reasoning_tokens,
+                    "text_tokens": text_tokens,
+                    **completion_tokens_details,
+                }
            )

        # Ensure completion_tokens_details is properly handled
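The rewritten CompletionTokensDetailsWrapper call above builds its kwargs by merging the provider-supplied completion_tokens_details dict over the computed defaults, so keys like thinking_enabled survive the wrap instead of being dropped. A plain-dict sketch of that merge; the values are illustrative, taken from the mocked response in the test below.

# Sketch of the merge semantics only; not the library code itself.
reasoning_tokens = 256
text_tokens = 18
completion_tokens_details = {"thinking_enabled": True}  # e.g. supplied by the Vertex handler

merged = {
    "reasoning_tokens": reasoning_tokens,
    "text_tokens": text_tokens,
    **completion_tokens_details,  # extra provider keys pass through; later keys win on conflict
}
print(merged)
# {'reasoning_tokens': 256, 'text_tokens': 18, 'thinking_enabled': True}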
@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):

    mock_client.assert_called()
    assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")
+
+
+def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_response.headers = {"Content-Type": "application/json"}
+    mock_response.json.return_value = {
+        "candidates": [
+            {
+                "content": {
+                    "role": "model",
+                    "parts": [
+                        {
+                            "text": "Hello! It's nice to hear from you. How can I help you today?"
+                        }
+                    ]
+                },
+                "finishReason": "STOP",
+                "avgLogprobs": -6.8490977817111549
+            }
+        ],
+        "usageMetadata": {
+            "promptTokenCount": 4,
+            "candidatesTokenCount": 18,
+            "totalTokenCount": 278,
+            "trafficType": "ON_DEMAND",
+            "promptTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 4
+                }
+            ],
+            "candidatesTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 18
+                }
+            ],
+            "thoughtsTokenCount": 256
+        },
+        "modelVersion": "gemini-2.5-flash-preview-04-17",
+        "createTime": "2025-04-22T03:22:20.094867Z",
+        "responseId": "bAsHaJPlBcCWm9IP_6inqAk"
+    }
+    return mock_response
+
+
+def test_vertex_ai_gemini_2_5_flash():
+    """
+    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
+    """
+    litellm.set_verbose = True
+    load_vertex_ai_credentials()
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            client=client,
+        )
+
+    mock_client.assert_called()
+    assert response.usage is not None
+    assert response.usage.completion_tokens_details.thinking_enabled is None
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            thinking={"type": "enabled", "budget_tokens": 1024},
+            client=client,
+        )
+
+    mock_client.assert_called()
+    assert response.usage is not None
+    assert response.usage.completion_tokens_details.thinking_enabled is True
+
+    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+    #     response = completion(
+    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
+    #         messages=[{"role": "user", "content": "Hello, world!"}],
+    #         thinking={"type": "enabled", "budget_tokens": 0},
+    #         client=client,
+    #     )
+
+    #     mock_client.assert_called()
+    #     assert response.usage is not None
+    #     assert response.usage.completion_tokens_details.thinking_enabled is False