Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-24 18:24:20 +00:00
fix(vertex_and_google_ai_studio_gemini.py): add check when thinking is disabled

Allows billing to work correctly. Fixes https://github.com/BerriAI/litellm/issues/10121
This commit is contained in:
parent 4a50cf10fb
commit 086981858b

3 changed files with 129 additions and 2 deletions
@@ -743,6 +743,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        is_thinking_enabled: Optional[
+            bool
+        ] = None,  # gemini-2.5-flash has thinking enabled by default
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -763,11 +766,13 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             reasoning_tokens = completion_response["usageMetadata"][
                 "thoughtsTokenCount"
             ]
+
         prompt_tokens_details = PromptTokensDetailsWrapper(
             cached_tokens=cached_tokens,
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )

         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
@@ -779,6 +784,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            completion_tokens_details={"thinking_enabled": is_thinking_enabled},
         )

         return usage
@@ -849,6 +855,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):

         return grounding_metadata, safety_ratings, citation_metadata

+    def _is_thinking_enabled_function(self, optional_params: Dict) -> Optional[bool]:
+        """
+        Returns True if thinking is enabled for the model, False if it is
+        explicitly disabled, and None if no thinkingConfig was passed.
+        """
+        thinking_config = cast(
+            Optional[GeminiThinkingConfig], optional_params.get("thinkingConfig", None)
+        )
+
+        if thinking_config is None:
+            return None
+
+        thinking_budget = thinking_config.get("thinkingBudget")
+        if thinking_budget == 0:
+            return False
+
+        return True
+
     def transform_response(
         self,
         model: str,
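The helper's contract is deliberately tri-state, since "caller said nothing" must stay distinguishable from "caller turned thinking off". A standalone sketch of the same logic (hypothetical names, not litellm's actual module):

    from typing import Dict, Optional

    def is_thinking_enabled(optional_params: Dict) -> Optional[bool]:
        # None  -> caller sent no thinkingConfig, so the model default applies
        #          (gemini-2.5-flash has thinking on by default)
        # False -> thinkingBudget == 0 explicitly disables thinking
        # True  -> any other thinkingConfig enables it
        thinking_config = optional_params.get("thinkingConfig")
        if thinking_config is None:
            return None
        if thinking_config.get("thinkingBudget") == 0:
            return False
        return True

    assert is_thinking_enabled({}) is None
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 0}}) is False
    assert is_thinking_enabled({"thinkingConfig": {"thinkingBudget": 1024}}) is True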
@@ -923,7 +946,11 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             _candidates, model_response, litellm_params
         )

-        usage = self._calculate_usage(completion_response=completion_response)
+        thinking_enabled = self._is_thinking_enabled_function(optional_params)
+        usage = self._calculate_usage(
+            completion_response=completion_response,
+            is_thinking_enabled=thinking_enabled,
+        )
         setattr(model_response, "usage", usage)

         ## ADD METADATA TO RESPONSE ##
@@ -795,6 +795,9 @@ class CompletionTokensDetailsWrapper(
     CompletionTokensDetails
 ):  # wrapper for older openai versions
     text_tokens: Optional[int] = None
+    thinking_enabled: Optional[
+        bool
+    ] = None  # for gemini-2.5-flash - this changes how billing is calculated
     """Text tokens generated by the model."""
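Why surface the flag at all: gemini-2.5-flash prices thinking output differently from non-thinking output, so a cost callback needs to branch on it. A minimal sketch with illustrative prices (the constants and function name here are assumptions, not litellm's pricing API):

    from typing import Optional

    # Illustrative per-token output prices (USD); gemini-2.5-flash-preview
    # was announced at $0.60/1M non-thinking vs $3.50/1M thinking output.
    OUTPUT_PRICE_NON_THINKING = 0.60 / 1_000_000
    OUTPUT_PRICE_THINKING = 3.50 / 1_000_000

    def output_cost(completion_tokens: int, thinking_enabled: Optional[bool]) -> float:
        # None means the caller never set thinkingConfig; the model default
        # (thinking on) applies, so bill at the thinking rate.
        if thinking_enabled is False:
            return completion_tokens * OUTPUT_PRICE_NON_THINKING
        return completion_tokens * OUTPUT_PRICE_THINKING

    print(output_cost(274, None))   # default: thinking rate
    print(output_cost(18, False))   # budget_tokens=0: cheaper rate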
@@ -853,7 +856,11 @@ class Usage(CompletionUsage):
             completion_tokens - reasoning_tokens if completion_tokens else None
         )
         completion_tokens_details = CompletionTokensDetailsWrapper(
-            reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
+            **{
+                "reasoning_tokens": reasoning_tokens,
+                "text_tokens": text_tokens,
+                **completion_tokens_details,
+            }
         )

         # Ensure completion_tokens_details is properly handled
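The switch from keyword arguments to a **{...} spread is about override order: keys spread later win, so values in the incoming completion_tokens_details dict (e.g. thinking_enabled from _calculate_usage) take precedence over the locally computed defaults. A standalone illustration of that dict-merge semantics:

    defaults = {"reasoning_tokens": 256, "text_tokens": 18}
    incoming = {"thinking_enabled": True}

    merged = {**defaults, **incoming}
    # -> {'reasoning_tokens': 256, 'text_tokens': 18, 'thinking_enabled': True}
    # If incoming repeated a key, its value would win, since it is spread last.
    print(merged)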
@@ -3521,3 +3521,96 @@ def test_litellm_api_base(monkeypatch, provider, route):

     mock_client.assert_called()
     assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")
+
+
+def vertex_httpx_mock_post_valid_response_with_thinking_enabled(*args, **kwargs):
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_response.headers = {"Content-Type": "application/json"}
+    mock_response.json.return_value = {
+        "candidates": [
+            {
+                "content": {
+                    "role": "model",
+                    "parts": [
+                        {
+                            "text": "Hello! It's nice to hear from you. How can I help you today?"
+                        }
+                    ],
+                },
+                "finishReason": "STOP",
+                "avgLogprobs": -6.8490977817111549,
+            }
+        ],
+        "usageMetadata": {
+            "promptTokenCount": 4,
+            "candidatesTokenCount": 18,
+            "totalTokenCount": 278,
+            "trafficType": "ON_DEMAND",
+            "promptTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 4,
+                }
+            ],
+            "candidatesTokensDetails": [
+                {
+                    "modality": "TEXT",
+                    "tokenCount": 18,
+                }
+            ],
+            "thoughtsTokenCount": 256,
+        },
+        "modelVersion": "gemini-2.5-flash-preview-04-17",
+        "createTime": "2025-04-22T03:22:20.094867Z",
+        "responseId": "bAsHaJPlBcCWm9IP_6inqAk",
+    }
+    return mock_response
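The usage numbers in the mock are chosen to exercise the billing path: Gemini reports thoughtsTokenCount outside candidatesTokenCount but inside totalTokenCount, so 4 + 18 + 256 = 278. A quick check of that arithmetic:

    usage_metadata = {
        "promptTokenCount": 4,
        "candidatesTokenCount": 18,
        "thoughtsTokenCount": 256,
        "totalTokenCount": 278,
    }
    assert (
        usage_metadata["promptTokenCount"]
        + usage_metadata["candidatesTokenCount"]
        + usage_metadata["thoughtsTokenCount"]
    ) == usage_metadata["totalTokenCount"]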
+
+
+def test_vertex_ai_gemini_2_5_flash():
+    """
+    Test that the vertex_ai/gemini-2.5-flash-preview-04-17 model is working correctly
+    """
+    litellm.set_verbose = True
+    load_vertex_ai_credentials()
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is None
+
+    with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+        response = completion(
+            model="vertex_ai/gemini-2.5-flash-preview-04-17",
+            messages=[{"role": "user", "content": "Hello, world!"}],
+            thinking={"type": "enabled", "budget_tokens": 1024},
+            client=client,
+        )
+
+        mock_client.assert_called()
+        assert response.usage is not None
+        assert response.usage.completion_tokens_details.thinking_enabled is True
+
+    # with patch.object(client, "post", side_effect=vertex_httpx_mock_post_valid_response_with_thinking_enabled) as mock_client:
+    #     response = completion(
+    #         model="vertex_ai/gemini-2.5-flash-preview-04-17",
+    #         messages=[{"role": "user", "content": "Hello, world!"}],
+    #         thinking={"type": "enabled", "budget_tokens": 0},
+    #         client=client,
+    #     )
+
+    #     mock_client.assert_called()
+    #     assert response.usage is not None
+    #     assert response.usage.completion_tokens_details.thinking_enabled is False
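Note that the test drives the feature through the portable thinking parameter rather than raw thinkingConfig; litellm translates the Anthropic-style dict into Gemini's native field before _is_thinking_enabled_function ever sees it. A simplified sketch of that mapping (names illustrative, not litellm's actual transform):

    def map_thinking_param(thinking: dict) -> dict:
        # {"type": "enabled", "budget_tokens": N} -> {"thinkingConfig": {"thinkingBudget": N}}
        thinking_config = {}
        if thinking.get("type") == "enabled":
            thinking_config["thinkingBudget"] = thinking.get("budget_tokens", 0)
        return {"thinkingConfig": thinking_config}

    assert map_thinking_param({"type": "enabled", "budget_tokens": 1024}) == {
        "thinkingConfig": {"thinkingBudget": 1024}
    }
    # budget_tokens=0 maps to thinkingBudget=0, which the helper reads as disabled
    assert map_thinking_param({"type": "enabled", "budget_tokens": 0}) == {
        "thinkingConfig": {"thinkingBudget": 0}
    }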