Gemini 2.5 Flash output cost is based on whether thinking is enabled

Andrew Wesie 2025-04-20 17:17:05 -05:00
parent 0b63c7a2eb
commit 58c2551fa6
7 changed files with 74 additions and 10 deletions

View file

@@ -128,6 +128,10 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, float]:
         except Exception:
             continue
+    output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
+    if usage.get("thinking_enabled") and output_cost_per_token_thinking is not None:
+        completion_base_cost = output_cost_per_token_thinking
+
     return prompt_base_cost, completion_base_cost
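In plain terms, the cost-calculator change prefers a dedicated thinking rate for completion tokens whenever the usage object reports that thinking was enabled. A minimal, self-contained sketch of that selection logic (a simplified dict-based stand-in, not the actual LiteLLM helper):

from typing import Optional, Tuple


def pick_base_costs(model_info: dict, usage: dict) -> Tuple[float, float]:
    # Default per-token rates from the model cost map.
    prompt_base_cost: float = model_info["input_cost_per_token"]
    completion_base_cost: float = model_info["output_cost_per_token"]

    # If the request ran with thinking enabled and the cost map defines a
    # thinking rate, bill completion tokens at that rate instead.
    thinking_rate: Optional[float] = model_info.get("output_cost_per_token_thinking")
    if usage.get("thinking_enabled") and thinking_rate is not None:
        completion_base_cost = thinking_rate

    return prompt_base_cost, completion_base_cost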

View file

@@ -743,6 +743,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        thinking_enabled: bool | None,
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -779,6 +780,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            thinking_enabled=thinking_enabled,
         )
         return usage
@@ -910,6 +912,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             completion_response=completion_response,
         )
+        thinking_enabled = None
+        if "gemini-2.5-flash" in model:
+            # Only Gemini 2.5 Flash can have its thinking disabled by setting the thinking budget to zero
+            thinking_budget = (
+                request_data.get("generationConfig", {})
+                .get("thinkingConfig", {})
+                .get("thinkingBudget")
+            )
+            thinking_enabled = thinking_budget != 0
+
         model_response.choices = []
         try:
@@ -923,7 +935,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
                 _candidates, model_response, litellm_params
             )
-            usage = self._calculate_usage(completion_response=completion_response)
+            usage = self._calculate_usage(
+                completion_response=completion_response,
+                thinking_enabled=thinking_enabled,
+            )
             setattr(model_response, "usage", usage)
         ## ADD METADATA TO RESPONSE ##
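The thinkingBudget lookup introduced above treats only an explicit budget of zero as "thinking disabled"; a request with no generationConfig or thinkingConfig at all still counts as enabled. A small standalone sketch of that behavior (illustrative helper name, request shape taken from the hunk):

def is_thinking_enabled(request_data: dict) -> bool:
    # Absent keys fall back to {} so the chained lookups never raise.
    thinking_budget = (
        request_data.get("generationConfig", {})
        .get("thinkingConfig", {})
        .get("thinkingBudget")
    )
    # None (unset) or any non-zero budget counts as enabled;
    # only an explicit 0 disables thinking.
    return thinking_budget != 0


assert is_thinking_enabled({}) is True
assert is_thinking_enabled({"generationConfig": {"thinkingConfig": {"thinkingBudget": 0}}}) is False
assert is_thinking_enabled({"generationConfig": {"thinkingConfig": {"thinkingBudget": 1024}}}) is True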

View file

@@ -5413,7 +5413,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "gemini",
         "mode": "chat",
         "rpm": 10,
@@ -5443,7 +5443,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "vertex_ai-language-models",
         "mode": "chat",
         "supports_reasoning": true,

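With the rates in this entry, the renamed key changes the bill noticeably whenever thinking is on. A rough back-of-the-envelope using the 1578-token completion from the test further down:

completion_tokens = 1578  # completion size used in the tests below

standard_rate = 0.6e-6  # output_cost_per_token
thinking_rate = 3.5e-6  # output_cost_per_token_thinking

print(f"thinking disabled: ${completion_tokens * standard_rate:.6f}")  # ~ $0.000947
print(f"thinking enabled:  ${completion_tokens * thinking_rate:.6f}")  # ~ $0.005523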
View file

@@ -138,6 +138,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
     input_cost_per_token_batches: Optional[float]
     output_cost_per_token_batches: Optional[float]
     output_cost_per_token: Required[float]
+    output_cost_per_token_thinking: Optional[
+        float
+    ]  # only for vertex ai gemini-2.5-flash models
     output_cost_per_character: Optional[float]  # only for vertex ai models
     output_cost_per_audio_token: Optional[float]
     output_cost_per_token_above_128k_tokens: Optional[

View file

@@ -4557,6 +4557,9 @@ def _get_model_info_helper(  # noqa: PLR0915
             "output_cost_per_token_batches"
         ),
         output_cost_per_token=_output_cost_per_token,
+        output_cost_per_token_thinking=_model_info.get(
+            "output_cost_per_token_thinking", None
+        ),
         output_cost_per_audio_token=_model_info.get(
             "output_cost_per_audio_token", None
         ),
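Once _get_model_info_helper forwards the field, callers should be able to read the thinking rate straight off the model info. A hedged sanity check (model name and provider taken from the tests below; the exact return shape of get_model_info is assumed):

import litellm

info = litellm.get_model_info(
    model="gemini-2.5-flash-preview-04-17",
    custom_llm_provider="gemini",
)
# Expected to mirror the cost map entry above (3.5e-6) after this change.
print(info.get("output_cost_per_token_thinking"))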

View file

@@ -5413,7 +5413,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "gemini",
         "mode": "chat",
         "rpm": 10,
@@ -5443,7 +5443,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "vertex_ai-language-models",
         "mode": "chat",
         "supports_reasoning": true,

View file

@@ -87,6 +87,49 @@ def test_reasoning_tokens_gemini():
         prompt_tokens_details=PromptTokensDetailsWrapper(
             audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
         ),
+        thinking_enabled=True,
     )
+    model_cost_map = litellm.model_cost[model]
+    prompt_cost, completion_cost = generic_cost_per_token(
+        model=model,
+        usage=usage,
+        custom_llm_provider=custom_llm_provider,
+    )
+    assert round(prompt_cost, 10) == round(
+        model_cost_map["input_cost_per_token"] * usage.prompt_tokens,
+        10,
+    )
+    assert round(completion_cost, 10) == round(
+        (
+            model_cost_map["output_cost_per_token_thinking"]
+            * usage.completion_tokens
+        ),
+        10,
+    )
+
+
+def test_reasoning_disabled_tokens_gemini():
+    model = "gemini-2.5-flash-preview-04-17"
+    custom_llm_provider = "gemini"
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+    usage = Usage(
+        completion_tokens=1578,
+        prompt_tokens=17,
+        total_tokens=1595,
+        completion_tokens_details=CompletionTokensDetailsWrapper(
+            accepted_prediction_tokens=None,
+            audio_tokens=None,
+            reasoning_tokens=None,
+            rejected_prediction_tokens=None,
+            text_tokens=1578,
+        ),
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
+        ),
+        thinking_enabled=False,
+    )
     model_cost_map = litellm.model_cost[model]
     prompt_cost, completion_cost = generic_cost_per_token(
@@ -102,11 +145,7 @@ def test_reasoning_tokens_gemini():
     assert round(completion_cost, 10) == round(
         (
             model_cost_map["output_cost_per_token"]
-            * usage.completion_tokens_details.text_tokens
-        )
-        + (
-            model_cost_map["output_cost_per_reasoning_token"]
-            * usage.completion_tokens_details.reasoning_tokens
+            * usage.completion_tokens
         ),
         10,
     )