mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 02:34:29 +00:00)
Gemini 2.5 Flash output cost is based on thinking enabled
parent 0b63c7a2eb · commit 58c2551fa6
7 changed files with 74 additions and 10 deletions
@@ -128,6 +128,10 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
         except Exception:
             continue
 
+    output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
+    if usage.get("thinking_enabled") and output_cost_per_token_thinking is not None:
+        completion_base_cost = output_cost_per_token_thinking
+
     return prompt_base_cost, completion_base_cost
 
 
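
A minimal standalone sketch of the selection logic this hunk adds, assuming model_info is a plain dict and thinking_enabled has already been resolved to a bool (the real helper reads both from litellm's ModelInfo and Usage objects); the helper name is illustrative:

# Prefer the thinking price for completion tokens when the call ran with thinking on.
def pick_completion_base_cost(model_info: dict, thinking_enabled: bool) -> float:
    completion_base_cost = model_info["output_cost_per_token"]
    output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
    if thinking_enabled and output_cost_per_token_thinking is not None:
        completion_base_cost = output_cost_per_token_thinking
    return completion_base_cost

info = {"output_cost_per_token": 0.6e-6, "output_cost_per_token_thinking": 3.5e-6}
assert pick_completion_base_cost(info, thinking_enabled=True) == 3.5e-6
assert pick_completion_base_cost(info, thinking_enabled=False) == 0.6e-6
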
@@ -743,6 +743,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        thinking_enabled: bool | None,
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -779,6 +780,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            thinking_enabled=thinking_enabled,
         )
 
         return usage
@@ -910,6 +912,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
             completion_response=completion_response,
         )
 
+        thinking_enabled = None
+        if "gemini-2.5-flash" in model:
+            # Only Gemini 2.5 Flash can have its thinking disabled by setting the thinking budget to zero
+            thinking_budget = (
+                request_data.get("generationConfig", {})
+                .get("thinkingConfig", {})
+                .get("thinkingBudget")
+            )
+            thinking_enabled = thinking_budget != 0
+
         model_response.choices = []
 
         try:
@@ -923,7 +935,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
                 _candidates, model_response, litellm_params
             )
 
-        usage = self._calculate_usage(completion_response=completion_response)
+        usage = self._calculate_usage(
+            completion_response=completion_response,
+            thinking_enabled=thinking_enabled,
+        )
         setattr(model_response, "usage", usage)
 
         ## ADD METADATA TO RESPONSE ##
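
A self-contained sketch of the thinking-budget check the hunk above performs on the outgoing request body; only the generationConfig.thinkingConfig.thinkingBudget lookup mirrors the diff, while the helper name and sample payloads are illustrative:

def is_thinking_enabled(request_data: dict) -> bool:
    # thinkingBudget == 0 is the only way to turn thinking off on Gemini 2.5 Flash,
    # so an absent thinkingConfig (budget None) still counts as enabled.
    thinking_budget = (
        request_data.get("generationConfig", {})
        .get("thinkingConfig", {})
        .get("thinkingBudget")
    )
    return thinking_budget != 0

assert is_thinking_enabled({"generationConfig": {"thinkingConfig": {"thinkingBudget": 0}}}) is False
assert is_thinking_enabled({"generationConfig": {}}) is True
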
@@ -5413,7 +5413,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "gemini",
         "mode": "chat",
         "rpm": 10,
@@ -5443,7 +5443,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "vertex_ai-language-models",
         "mode": "chat",
         "supports_reasoning": true,
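
A rough cost comparison using the prices above (0.6e-6 USD per output token without thinking vs 3.5e-6 USD with thinking) and the 1578 completion tokens used by the tests further down; illustrative arithmetic only:

completion_tokens = 1578
cost_thinking_enabled = completion_tokens * 3.5e-6   # ~0.0055230 USD
cost_thinking_disabled = completion_tokens * 0.6e-6  # ~0.0009468 USD
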
@@ -138,6 +138,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
     input_cost_per_token_batches: Optional[float]
     output_cost_per_token_batches: Optional[float]
     output_cost_per_token: Required[float]
+    output_cost_per_token_thinking: Optional[
+        float
+    ]  # only for vertex ai gemini-2.5-flash models
     output_cost_per_character: Optional[float]  # only for vertex ai models
     output_cost_per_audio_token: Optional[float]
     output_cost_per_token_above_128k_tokens: Optional[
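
A sketch of how the new optional key sits alongside the existing ones in a TypedDict like the one patched above, reproducing only the fields shown in the hunk; the class name is illustrative:

from typing import Optional
from typing_extensions import Required, TypedDict

class ModelInfoSketch(TypedDict, total=False):
    output_cost_per_token: Required[float]
    output_cost_per_token_thinking: Optional[float]  # only for vertex ai gemini-2.5-flash models
    output_cost_per_audio_token: Optional[float]
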
@@ -4557,6 +4557,9 @@ def _get_model_info_helper( # noqa: PLR0915
                 "output_cost_per_token_batches"
             ),
             output_cost_per_token=_output_cost_per_token,
+            output_cost_per_token_thinking=_model_info.get(
+                "output_cost_per_token_thinking", None
+            ),
             output_cost_per_audio_token=_model_info.get(
                 "output_cost_per_audio_token", None
             ),
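
Once plumbed through here, the new key should surface in the model-info dict; a hedged usage sketch, assuming the gemini/ model prefix and that the cost map entries above are loaded:

import litellm

info = litellm.get_model_info(model="gemini/gemini-2.5-flash-preview-04-17")
print(info.get("output_cost_per_token_thinking"))  # expected: 3.5e-06 per the cost map above
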
@@ -5413,7 +5413,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "gemini",
         "mode": "chat",
         "rpm": 10,
@@ -5443,7 +5443,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "vertex_ai-language-models",
         "mode": "chat",
         "supports_reasoning": true,
@@ -87,6 +87,49 @@ def test_reasoning_tokens_gemini():
         prompt_tokens_details=PromptTokensDetailsWrapper(
             audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
         ),
+        thinking_enabled=True,
+    )
+    model_cost_map = litellm.model_cost[model]
+    prompt_cost, completion_cost = generic_cost_per_token(
+        model=model,
+        usage=usage,
+        custom_llm_provider=custom_llm_provider,
+    )
+
+    assert round(prompt_cost, 10) == round(
+        model_cost_map["input_cost_per_token"] * usage.prompt_tokens,
+        10,
+    )
+    assert round(completion_cost, 10) == round(
+        (
+            model_cost_map["output_cost_per_token_thinking"]
+            * usage.completion_tokens
+        ),
+        10,
+    )
+
+
+def test_reasoning_disabled_tokens_gemini():
+    model = "gemini-2.5-flash-preview-04-17"
+    custom_llm_provider = "gemini"
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    usage = Usage(
+        completion_tokens=1578,
+        prompt_tokens=17,
+        total_tokens=1595,
+        completion_tokens_details=CompletionTokensDetailsWrapper(
+            accepted_prediction_tokens=None,
+            audio_tokens=None,
+            reasoning_tokens=None,
+            rejected_prediction_tokens=None,
+            text_tokens=1578,
+        ),
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
+        ),
+        thinking_enabled=False,
+    )
     )
     model_cost_map = litellm.model_cost[model]
     prompt_cost, completion_cost = generic_cost_per_token(
@@ -102,11 +145,7 @@ def test_reasoning_tokens_gemini():
     assert round(completion_cost, 10) == round(
         (
             model_cost_map["output_cost_per_token"]
-            * usage.completion_tokens_details.text_tokens
-        )
-        + (
-            model_cost_map["output_cost_per_reasoning_token"]
-            * usage.completion_tokens_details.reasoning_tokens
+            * usage.completion_tokens
         ),
         10,
     )