Andrew Wesie 2025-04-21 20:36:38 -07:00 committed by GitHub
commit cbab62f586
8 changed files with 142 additions and 19 deletions

View file

@@ -128,6 +128,10 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
except Exception:
continue
output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
if usage.get("thinking_enabled") and output_cost_per_token_thinking is not None:
completion_base_cost = output_cost_per_token_thinking
return prompt_base_cost, completion_base_cost
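The hunk above swaps in the model's output_cost_per_token_thinking rate for the completion side whenever the recorded usage says thinking was enabled. A minimal sketch of that selection, using plain dicts as hypothetical stand-ins for litellm's ModelInfo and Usage types:

# Minimal sketch of the rate selection above; plain dicts are hypothetical
# stand-ins for litellm's ModelInfo and Usage types.
from typing import Optional, Tuple


def pick_base_costs(model_info: dict, thinking_enabled: Optional[bool]) -> Tuple[float, float]:
    prompt_base_cost: float = model_info["input_cost_per_token"]
    completion_base_cost: float = model_info["output_cost_per_token"]

    # The thinking-specific output rate only applies when the request actually
    # ran with thinking enabled and the model defines such a rate.
    thinking_rate = model_info.get("output_cost_per_token_thinking")
    if thinking_enabled and thinking_rate is not None:
        completion_base_cost = thinking_rate
    return prompt_base_cost, completion_base_cost


# With the gemini-2.5-flash-preview pricing added in this commit:
rates = {
    "input_cost_per_token": 0.15e-6,
    "output_cost_per_token": 0.6e-6,
    "output_cost_per_token_thinking": 3.5e-6,
}
assert pick_base_costs(rates, thinking_enabled=True) == (0.15e-6, 3.5e-6)
assert pick_base_costs(rates, thinking_enabled=False) == (0.15e-6, 0.6e-6)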

View file

@@ -365,17 +365,14 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
if reasoning_effort == "low":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "medium":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "high":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
"includeThoughts": True,
}
else:
raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
@@ -388,9 +385,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
thinking_budget = thinking_param.get("budget_tokens")
params: GeminiThinkingConfig = {}
if thinking_enabled:
params["includeThoughts"] = True
if thinking_budget:
if not thinking_enabled:
params["thinkingBudget"] = 0
elif thinking_budget is not None:
params["thinkingBudget"] = thinking_budget
return params
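Taken together, the two hunks above drop the per-effort includeThoughts entries and make an Anthropic-style thinking param with budget_tokens=0 (or a non-"enabled" type) translate into an explicit thinkingBudget of 0. A simplified, hypothetical stand-alone version of that mapping, with the outputs the new tests at the end of this commit expect:

# Simplified, hypothetical stand-in for the mapping above (not litellm's
# actual method): turn an Anthropic-style thinking param into a Gemini
# thinkingConfig dict.
from typing import Optional, TypedDict


class ThinkingParam(TypedDict, total=False):
    type: str
    budget_tokens: int


def map_thinking_param(thinking: ThinkingParam) -> dict:
    thinking_enabled = thinking.get("type") == "enabled"
    thinking_budget: Optional[int] = thinking.get("budget_tokens")

    params: dict = {}
    if thinking_enabled:
        params["includeThoughts"] = True
    if not thinking_enabled:
        # For Gemini 2.5 Flash, a zero budget means "thinking disabled".
        params["thinkingBudget"] = 0
    elif thinking_budget is not None:
        params["thinkingBudget"] = thinking_budget
    return params


assert map_thinking_param({"type": "enabled", "budget_tokens": 0}) == {
    "includeThoughts": True,
    "thinkingBudget": 0,
}
assert map_thinking_param({"type": "enabled"}) == {"includeThoughts": True}
assert map_thinking_param({"type": "invalid"}) == {"thinkingBudget": 0}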
@@ -743,6 +740,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
def _calculate_usage(
self,
completion_response: GenerateContentResponseBody,
thinking_enabled: bool | None,
) -> Usage:
cached_tokens: Optional[int] = None
audio_tokens: Optional[int] = None
@@ -768,17 +766,24 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens=audio_tokens,
text_tokens=text_tokens,
)
completion_tokens = completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
)
if reasoning_tokens:
# Usage(...) constructor expects that completion_tokens includes the reasoning_tokens.
# However the Vertex AI usage metadata does not include reasoning tokens in candidatesTokenCount.
# Reportedly, this is different from the Gemini API.
completion_tokens += reasoning_tokens
## GET USAGE ##
usage = Usage(
prompt_tokens=completion_response["usageMetadata"].get(
"promptTokenCount", 0
),
completion_tokens=completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
),
completion_tokens=completion_tokens,
total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
prompt_tokens_details=prompt_tokens_details,
reasoning_tokens=reasoning_tokens,
thinking_enabled=thinking_enabled,
)
return usage
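Because Vertex AI's candidatesTokenCount does not include thinking tokens, the hunk above adds the reported reasoning tokens back before building Usage, so completion_tokens covers both visible output and thoughts. A worked example with hypothetical usageMetadata whose totals match the test fixtures later in this commit (the 100/1478 split and the thoughtsTokenCount field name are assumptions):

# Worked example with hypothetical usageMetadata: candidatesTokenCount is
# assumed to report only visible output, with reasoning tokens in a separate
# thoughtsTokenCount field.
usage_metadata = {
    "promptTokenCount": 17,
    "candidatesTokenCount": 100,  # visible completion tokens only
    "thoughtsTokenCount": 1478,   # reasoning tokens (hypothetical split)
    "totalTokenCount": 1595,
}

reasoning_tokens = usage_metadata.get("thoughtsTokenCount")
completion_tokens = usage_metadata.get("candidatesTokenCount", 0)
if reasoning_tokens:
    # Mirror the adjustment above: Usage expects completion_tokens to include
    # reasoning tokens, but candidatesTokenCount does not.
    completion_tokens += reasoning_tokens

assert completion_tokens == 1578
assert usage_metadata["promptTokenCount"] + completion_tokens == 1595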
@@ -910,6 +915,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
completion_response=completion_response,
)
thinking_enabled = None
if "gemini-2.5-flash" in model:
# Only Gemini 2.5 Flash can have its thinking disabled by setting the thinking budget to zero
thinking_budget = (
request_data.get("generationConfig", {})
.get("thinkingConfig", {})
.get("thinkingBudget")
)
thinking_enabled = thinking_budget != 0
model_response.choices = []
try:
@@ -923,7 +938,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
_candidates, model_response, litellm_params
)
usage = self._calculate_usage(completion_response=completion_response)
usage = self._calculate_usage(
completion_response=completion_response,
thinking_enabled=thinking_enabled,
)
setattr(model_response, "usage", usage)
## ADD METADATA TO RESPONSE ##
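The thinking_enabled flag above is inferred from the outgoing request rather than the response: thinking is on by default for gemini-2.5-flash models, so only an explicit thinkingBudget of 0 counts as disabled. A small sketch of that lookup against hypothetical request payloads:

# Sketch of the thinking_enabled detection above, with hypothetical
# request_data payloads. Absence of a thinkingConfig (or a missing budget)
# is treated as "enabled", matching the default for gemini-2.5-flash.
def is_thinking_enabled(request_data: dict) -> bool:
    thinking_budget = (
        request_data.get("generationConfig", {})
        .get("thinkingConfig", {})
        .get("thinkingBudget")
    )
    return thinking_budget != 0


assert is_thinking_enabled({}) is True  # no thinkingConfig -> default, enabled
assert is_thinking_enabled(
    {"generationConfig": {"thinkingConfig": {"thinkingBudget": 0}}}
) is False
assert is_thinking_enabled(
    {"generationConfig": {"thinkingConfig": {"thinkingBudget": 1024}}}
) is True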

View file

@@ -5413,7 +5413,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5443,7 +5443,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,

View file

@@ -138,6 +138,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
input_cost_per_token_batches: Optional[float]
output_cost_per_token_batches: Optional[float]
output_cost_per_token: Required[float]
output_cost_per_token_thinking: Optional[
float
] # only for vertex ai gemini-2.5-flash models
output_cost_per_character: Optional[float] # only for vertex ai models
output_cost_per_audio_token: Optional[float]
output_cost_per_token_above_128k_tokens: Optional[

View file

@@ -4572,6 +4572,9 @@ def _get_model_info_helper( # noqa: PLR0915
"output_cost_per_token_batches"
),
output_cost_per_token=_output_cost_per_token,
output_cost_per_token_thinking=_model_info.get(
"output_cost_per_token_thinking", None
),
output_cost_per_audio_token=_model_info.get(
"output_cost_per_audio_token", None
),
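With the passthrough above, the new rate should surface through litellm's model-info lookup. A hedged usage sketch; the exact model alias below is an assumption, not taken from this commit:

# Hedged usage sketch: read the thinking rate back via litellm's model-info
# lookup (model alias assumed).
import litellm

info = litellm.get_model_info(model="gemini/gemini-2.5-flash-preview-04-17")
print(info.get("output_cost_per_token_thinking"))  # expected: 3.5e-06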

View file

@@ -5413,7 +5413,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5443,7 +5443,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,

View file

@@ -87,6 +87,49 @@ def test_reasoning_tokens_gemini():
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
),
thinking_enabled=True,
)
model_cost_map = litellm.model_cost[model]
prompt_cost, completion_cost = generic_cost_per_token(
model=model,
usage=usage,
custom_llm_provider=custom_llm_provider,
)
assert round(prompt_cost, 10) == round(
model_cost_map["input_cost_per_token"] * usage.prompt_tokens,
10,
)
assert round(completion_cost, 10) == round(
(
model_cost_map["output_cost_per_token_thinking"]
* usage.completion_tokens
),
10,
)
def test_reasoning_disabled_tokens_gemini():
model = "gemini-2.5-flash-preview-04-17"
custom_llm_provider = "gemini"
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
usage = Usage(
completion_tokens=1578,
prompt_tokens=17,
total_tokens=1595,
completion_tokens_details=CompletionTokensDetailsWrapper(
accepted_prediction_tokens=None,
audio_tokens=None,
reasoning_tokens=None,
rejected_prediction_tokens=None,
text_tokens=1578,
),
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
),
thinking_enabled=False,
)
model_cost_map = litellm.model_cost[model]
prompt_cost, completion_cost = generic_cost_per_token(
@@ -102,11 +145,7 @@ def test_reasoning_tokens_gemini():
assert round(completion_cost, 10) == round(
(
model_cost_map["output_cost_per_token"]
* usage.completion_tokens_details.text_tokens
)
+ (
model_cost_map["output_cost_per_reasoning_token"]
* usage.completion_tokens_details.reasoning_tokens
* usage.completion_tokens
),
10,
)

View file

@@ -259,3 +259,59 @@ def test_vertex_ai_empty_content():
content, reasoning_content = v.get_assistant_content_message(parts=parts)
assert content is None
assert reasoning_content is None
def test_vertex_ai_thinking_disabled():
from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
VertexGeminiConfig,
)
from litellm.types.llms.anthropic import AnthropicThinkingParam
v = VertexGeminiConfig()
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled", budget_tokens=0),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 0
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled"),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert "thinkingBudget" not in optional_params["thinkingConfig"]
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled", budget_tokens=1024),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 1024
optional_params = v.map_openai_params(
non_default_params={
"thinking": cast(AnthropicThinkingParam, {"type": "invalid"}),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 0
optional_params = v.map_openai_params(
non_default_params={},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert "thinkingConfig" not in optional_params