Gemini-2.5-flash improvements (#10198)
* fix(vertex_and_google_ai_studio_gemini.py): allow thinking budget = 0
  Fixes https://github.com/BerriAI/litellm/issues/10121
* fix(vertex_and_google_ai_studio_gemini.py): handle nuance in counting exclusive vs. inclusive tokens
  Addresses https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
Parent: bdfb6c5a76
Commit: 6cd8330fc5
3 changed files with 102 additions and 6 deletions
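For context, the first fix means a caller can explicitly request a zero thinking budget. Based on the new test added at the end of this commit, such a call looks roughly like the sketch below (the model name and the shape of the `thinking` parameter are taken from that test; the surrounding call is illustrative):

import litellm

# Request Gemini 2.5 Flash with extended thinking disabled via a zero budget.
# After this fix, the 0 is forwarded to the provider as `thinkingBudget: 0`
# instead of being dropped from the request.
response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
    thinking={"type": "enabled", "budget_tokens": 0},
)
print(response.choices[0].message.content)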
@@ -57,6 +57,7 @@ from litellm.types.llms.vertex_ai import (
     LogprobsResult,
     ToolConfig,
     Tools,
+    UsageMetadata,
 )
 from litellm.types.utils import (
     ChatCompletionTokenLogprob,
@@ -390,7 +391,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
         params: GeminiThinkingConfig = {}
         if thinking_enabled:
             params["includeThoughts"] = True
-        if thinking_budget:
+        if thinking_budget is not None and isinstance(thinking_budget, int):
             params["thinkingBudget"] = thinking_budget

         return params
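Why the guard changed: 0 is falsy in Python, so the old `if thinking_budget:` check silently discarded an explicit budget of 0 (the case reported in issue 10121). A minimal illustration:

thinking_budget = 0

if thinking_budget:
    print("old check: budget forwarded")   # never runs for a 0 budget

if thinking_budget is not None and isinstance(thinking_budget, int):
    print("new check: budget forwarded")   # runs, so thinkingBudget=0 reaches the request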
@@ -740,6 +741,23 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):

         return model_response

+    def is_candidate_token_count_inclusive(self, usage_metadata: UsageMetadata) -> bool:
+        """
+        Check if the candidate token count is inclusive of the thinking token count
+
+        if prompttokencount + candidatesTokenCount == totalTokenCount, then the candidate token count is inclusive of the thinking token count
+
+        else the candidate token count is exclusive of the thinking token count
+
+        Addresses - https://github.com/BerriAI/litellm/pull/10141#discussion_r2052272035
+        """
+        if usage_metadata.get("promptTokenCount", 0) + usage_metadata.get(
+            "candidatesTokenCount", 0
+        ) == usage_metadata.get("totalTokenCount", 0):
+            return True
+        else:
+            return False
+
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
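The new helper relies only on arithmetic over the usage metadata. Using the same numbers as the parametrized test added further down: with promptTokenCount=10, candidatesTokenCount=10, totalTokenCount=20 the sums match (10 + 10 == 20), so the candidate count already includes the thinking tokens (inclusive); with candidatesTokenCount=5 and the same prompt and total counts, 10 + 5 != 20, so the 5 thoughtsTokenCount tokens are reported separately (exclusive).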
@@ -768,14 +786,23 @@
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if (
+            not self.is_candidate_token_count_inclusive(
+                completion_response["usageMetadata"]
+            )
+            and reasoning_tokens
+        ):
+            completion_tokens = reasoning_tokens + completion_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
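Putting the two changes together in `_calculate_usage`: when the metadata is exclusive, the thinking tokens are folded back into the completion count, so for the exclusive example above completion_tokens becomes 5 (candidates) + 5 (reasoning) = 10, matching the inclusive case, while total_tokens stays at the reported totalTokenCount of 20. The remaining hunks add the corresponding tests.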
@@ -10,7 +10,8 @@ from litellm import ModelResponse
 from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
     VertexGeminiConfig,
 )
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.llms.vertex_ai import UsageMetadata
+from litellm.types.utils import ChoiceLogprobs, Usage


 def test_top_logprobs():
@@ -259,3 +260,53 @@ def test_vertex_ai_empty_content():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content is None
     assert reasoning_content is None
+
+
+@pytest.mark.parametrize(
+    "usage_metadata, inclusive, expected_usage",
+    [
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=10,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            True,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+        (
+            UsageMetadata(
+                promptTokenCount=10,
+                candidatesTokenCount=5,
+                totalTokenCount=20,
+                thoughtsTokenCount=5,
+            ),
+            False,
+            Usage(
+                prompt_tokens=10,
+                completion_tokens=10,
+                total_tokens=20,
+                reasoning_tokens=5,
+            ),
+        ),
+    ],
+)
+def test_vertex_ai_candidate_token_count_inclusive(
+    usage_metadata, inclusive, expected_usage
+):
+    """
+    Test that the candidate token count is inclusive of the thinking token count
+    """
+    v = VertexGeminiConfig()
+    assert v.is_candidate_token_count_inclusive(usage_metadata) is inclusive
+
+    usage = v._calculate_usage(completion_response={"usageMetadata": usage_metadata})
+    assert usage.prompt_tokens == expected_usage.prompt_tokens
+    assert usage.completion_tokens == expected_usage.completion_tokens
+    assert usage.total_tokens == expected_usage.total_tokens
@@ -116,4 +116,22 @@ def test_gemini_thinking():
         messages=messages, # make sure call works
     )
     print(response.choices[0].message)
    assert response.choices[0].message.content is not None
+
+
+def test_gemini_thinking_budget_0():
+    litellm._turn_on_debug()
+    from litellm.types.utils import Message, CallTypes
+    from litellm.utils import return_raw_request
+    import json
+
+    raw_request = return_raw_request(
+        endpoint=CallTypes.completion,
+        kwargs={
+            "model": "gemini/gemini-2.5-flash-preview-04-17",
+            "messages": [{"role": "user", "content": "Explain the concept of Occam's Razor and provide a simple, everyday example"}],
+            "thinking": {"type": "enabled", "budget_tokens": 0}
+        }
+    )
+    print(raw_request)
+    assert "0" in json.dumps(raw_request["raw_request_body"])
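Given the parameter mapping in the second hunk, a request with `thinking={"type": "enabled", "budget_tokens": 0}` should produce a thinking config along the lines of the sketch below. The exact nesting of this dict inside the raw Gemini request body is an assumption here, which is also why the test only asserts that a "0" appears in the serialized request.

# Hypothetical shape of the mapped thinking config (keys come from GeminiThinkingConfig in the diff above):
thinking_config = {
    "includeThoughts": True,   # thinking is enabled
    "thinkingBudget": 0,       # the zero budget is now preserved
}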