Gemini-2.5-flash - support reasoning cost calc + return reasoning content (#10141)

* build(model_prices_and_context_window.json): add vertex ai gemini-2.5-flash pricing

* build(model_prices_and_context_window.json): add gemini reasoning token pricing

* fix(vertex_and_google_ai_studio_gemini.py): support counting thinking tokens for gemini

allows accurate cost calc

* fix(utils.py): add reasoning token cost calc to generic cost calc

ensures gemini-2.5-flash cost calculation is accurate

* build(model_prices_and_context_window.json): mark gemini-2.5-flash as 'supports_reasoning'

* feat(gemini/): support 'thinking' + 'reasoning_effort' params + new unit tests

allows controlling thinking effort for gemini-2.5-flash models (usage sketch below, after the commit metadata)

* test: update unit tests

* feat(vertex_and_google_ai_studio_gemini.py): return reasoning content if given in gemini response

* test: update model name

* fix: fix ruff check

* test(test_spend_management_endpoints.py): update tests to be less sensitive to new keys / updates to usage object

* fix(vertex_and_google_ai_studio_gemini.py): fix translation
Krish Dholakia 2025-04-19 09:20:52 -07:00 committed by GitHub
parent db4ebe10c8
commit 36308a31be
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 453 additions and 88 deletions
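
A hedged end-to-end usage sketch of what this commit enables (the model string, prompt, and attribute access below are illustrative, not taken from the diff):

import litellm

response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",  # illustrative model string
    messages=[{"role": "user", "content": "Outline a plan to debug a flaky test."}],
    reasoning_effort="low",  # mapped below to thinkingConfig={"thinkingBudget": 1024, "includeThoughts": True}
)

print(response.choices[0].message.content)
# thought parts returned by Gemini are surfaced separately (may be None):
print(getattr(response.choices[0].message, "reasoning_content", None))
# thoughtsTokenCount is mapped into usage for accurate cost calculation:
details = response.usage.completion_tokens_details
print(details.reasoning_tokens if details else None)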

View file

@@ -21,6 +21,10 @@ DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
########## Networking constants ##############################################################
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour
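
For reference, a small sketch of how these constants are used later in this diff: OpenAI-style reasoning_effort values map onto fixed thinking budgets (the dict below is illustrative; the real mappings live in the Anthropic and Gemini configs):

# illustrative mapping only; mirrors the values defined above
REASONING_EFFORT_TO_THINKING_BUDGET = {
    "low": 1024,     # DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET
    "medium": 2048,  # DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET
    "high": 4096,    # DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET
}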

View file

@@ -267,6 +267,7 @@ def generic_cost_per_token(
## CALCULATE OUTPUT COST
text_tokens = usage.completion_tokens
audio_tokens = 0
reasoning_tokens = 0
if usage.completion_tokens_details is not None:
audio_tokens = (
cast(
@@ -282,7 +283,13 @@
)
or usage.completion_tokens # default to completion tokens, if this field is not set
)
reasoning_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
)
or 0
)
## TEXT COST
completion_cost = float(text_tokens) * completion_base_cost
@@ -290,6 +297,10 @@
"output_cost_per_audio_token"
)
_output_cost_per_reasoning_token: Optional[float] = model_info.get(
"output_cost_per_reasoning_token"
)
## AUDIO COST
if (
_output_cost_per_audio_token is not None
@@ -298,4 +309,12 @@
):
completion_cost += float(audio_tokens) * _output_cost_per_audio_token
## REASONING COST
if (
_output_cost_per_reasoning_token is not None
and reasoning_tokens
and reasoning_tokens > 0
):
completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
return prompt_cost, completion_cost
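
To make the new reasoning branch concrete, a worked sketch with assumed token counts and the gemini-2.5-flash preview prices added later in this diff (assumes completion_tokens_details splits text and reasoning tokens):

# illustrative numbers only
input_cost_per_token = 0.15e-6
output_cost_per_token = 0.6e-6
output_cost_per_reasoning_token = 3.5e-6

prompt_tokens = 1_000
text_tokens = 200        # non-thinking completion tokens, billed at the text rate
reasoning_tokens = 800   # thinking tokens, billed at the reasoning rate

prompt_cost = prompt_tokens * input_cost_per_token                      # ~$0.00015
completion_cost = text_tokens * output_cost_per_token                   # ~$0.00012
completion_cost += reasoning_tokens * output_cost_per_reasoning_token   # +$0.00280 -> ~$0.00292 total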

View file

@@ -7,6 +7,9 @@ import httpx
import litellm
from litellm.constants import (
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
RESPONSE_FORMAT_TOOL_NAME,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
@@ -276,11 +279,20 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
if reasoning_effort is None:
return None
elif reasoning_effort == "low":
return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
)
elif reasoning_effort == "medium":
return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
elif reasoning_effort == "high":
return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
)
else:
raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
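
The same OpenAI-style parameter therefore keeps working for Anthropic models, now driven by the shared constants; a hedged usage sketch (the model name is illustrative):

import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",  # illustrative model name
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    reasoning_effort="medium",  # -> thinking={"type": "enabled", "budget_tokens": 2048}
)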

View file

@@ -7,6 +7,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
)
from litellm.types.llms.openai import AllMessageValues
from litellm.types.llms.vertex_ai import ContentType, PartType
from litellm.utils import supports_reasoning
from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history
from ...vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig
@@ -67,7 +68,7 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@@ -83,6 +84,10 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
"frequency_penalty",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_openai_params(
self,
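
A hedged sketch of the gating above: only models flagged as supporting reasoning advertise the two extra params (model strings are illustrative):

from litellm.utils import supports_reasoning

# models with "supports_reasoning": true in the pricing map also expose
# "reasoning_effort" and "thinking" as supported OpenAI params
supports_reasoning(model="gemini/gemini-2.5-flash-preview-04-17")  # True, per the pricing entry in this PR
supports_reasoning(model="gemini/gemini-1.5-flash")                # False (illustrative)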

View file

@@ -24,6 +24,11 @@ import litellm
import litellm.litellm_core_utils
import litellm.litellm_core_utils.litellm_logging
from litellm import verbose_logger
from litellm.constants import (
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.llms.custom_httpx.http_handler import (
@@ -31,6 +36,7 @@ from litellm.llms.custom_httpx.http_handler import (
HTTPHandler,
get_async_httpx_client,
)
from litellm.types.llms.anthropic import AnthropicThinkingParam
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionResponseMessage,
@@ -45,6 +51,7 @@ from litellm.types.llms.vertex_ai import (
ContentType,
FunctionCallingConfig,
FunctionDeclaration,
GeminiThinkingConfig,
GenerateContentResponseBody,
HttpxPartType,
LogprobsResult,
@@ -59,7 +66,7 @@ from litellm.types.utils import (
TopLogprob,
Usage,
)
from litellm.utils import CustomStreamWrapper, ModelResponse
from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning
from ....utils import _remove_additional_properties, _remove_strict_from_schema
from ..common_utils import VertexAIError, _build_vertex_schema
@@ -190,7 +197,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@@ -210,6 +217,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
"top_logprobs",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_tool_choice_values(
self, model: str, tool_choice: Union[str, dict]
@@ -313,10 +324,14 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
if isinstance(old_schema, list):
for item in old_schema:
if isinstance(item, dict):
item = _build_vertex_schema(parameters=item, add_property_ordering=True)
item = _build_vertex_schema(
parameters=item, add_property_ordering=True
)
elif isinstance(old_schema, dict):
old_schema = _build_vertex_schema(parameters=old_schema, add_property_ordering=True)
old_schema = _build_vertex_schema(
parameters=old_schema, add_property_ordering=True
)
return old_schema
def apply_response_schema_transformation(self, value: dict, optional_params: dict):
@ -343,6 +358,43 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
value=optional_params["response_schema"]
)
@staticmethod
def _map_reasoning_effort_to_thinking_budget(
reasoning_effort: str,
) -> GeminiThinkingConfig:
if reasoning_effort == "low":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "medium":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "high":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
"includeThoughts": True,
}
else:
raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
@staticmethod
def _map_thinking_param(
thinking_param: AnthropicThinkingParam,
) -> GeminiThinkingConfig:
thinking_enabled = thinking_param.get("type") == "enabled"
thinking_budget = thinking_param.get("budget_tokens")
params: GeminiThinkingConfig = {}
if thinking_enabled:
params["includeThoughts"] = True
if thinking_budget:
params["thinkingBudget"] = thinking_budget
return params
def map_openai_params(
self,
non_default_params: Dict,
@@ -399,6 +451,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
optional_params["tool_choice"] = _tool_choice_value
elif param == "seed":
optional_params["seed"] = value
elif param == "reasoning_effort" and isinstance(value, str):
optional_params[
"thinkingConfig"
] = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(value)
elif param == "thinking":
optional_params[
"thinkingConfig"
] = VertexGeminiConfig._map_thinking_param(
cast(AnthropicThinkingParam, value)
)
elif param == "modalities" and isinstance(value, list):
response_modalities = []
for modality in value:
@@ -514,19 +576,27 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
def get_assistant_content_message(
self, parts: List[HttpxPartType]
) -> Optional[str]:
_content_str = ""
) -> Tuple[Optional[str], Optional[str]]:
content_str: Optional[str] = None
reasoning_content_str: Optional[str] = None
for part in parts:
_content_str = ""
if "text" in part:
_content_str += part["text"]
elif "inlineData" in part: # base64 encoded image
_content_str += "data:{};base64,{}".format(
part["inlineData"]["mimeType"], part["inlineData"]["data"]
)
if part.get("thought") is True:
if reasoning_content_str is None:
reasoning_content_str = ""
reasoning_content_str += _content_str
else:
if content_str is None:
content_str = ""
content_str += _content_str
if _content_str:
return _content_str
return None
return content_str, reasoning_content_str
def _transform_parts(
self,
@@ -677,6 +747,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens: Optional[int] = None
text_tokens: Optional[int] = None
prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
reasoning_tokens: Optional[int] = None
if "cachedContentTokenCount" in completion_response["usageMetadata"]:
cached_tokens = completion_response["usageMetadata"][
"cachedContentTokenCount"
@@ -687,7 +758,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens = detail["tokenCount"]
elif detail["modality"] == "TEXT":
text_tokens = detail["tokenCount"]
if "thoughtsTokenCount" in completion_response["usageMetadata"]:
reasoning_tokens = completion_response["usageMetadata"][
"thoughtsTokenCount"
]
prompt_tokens_details = PromptTokensDetailsWrapper(
cached_tokens=cached_tokens,
audio_tokens=audio_tokens,
@ -703,6 +777,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
),
total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
prompt_tokens_details=prompt_tokens_details,
reasoning_tokens=reasoning_tokens,
)
return usage
@@ -731,11 +806,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
citation_metadata.append(candidate["citationMetadata"])
if "parts" in candidate["content"]:
chat_completion_message[
"content"
] = VertexGeminiConfig().get_assistant_content_message(
(
content,
reasoning_content,
) = VertexGeminiConfig().get_assistant_content_message(
parts=candidate["content"]["parts"]
)
if content is not None:
chat_completion_message["content"] = content
if reasoning_content is not None:
chat_completion_message["reasoning_content"] = reasoning_content
functions, tools = self._transform_parts(
parts=candidate["content"]["parts"],
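
Putting the request-side helpers above together, a short sketch of what they return (both are static methods, so they can be called on the class; values follow the constants at the top of this diff):

from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig

VertexGeminiConfig._map_reasoning_effort_to_thinking_budget("low")
# -> {"thinkingBudget": 1024, "includeThoughts": True}

VertexGeminiConfig._map_thinking_param({"type": "enabled", "budget_tokens": 512})
# -> {"includeThoughts": True, "thinkingBudget": 512}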

View file

@@ -5178,9 +5178,10 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 0.0000001,
"input_cost_per_token": 0.00000015,
"output_cost_per_token": 0.00000060,
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5188,9 +5189,39 @@
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"
},
"gemini-2.5-flash-preview-04-17": {
"max_tokens": 65536,
"max_input_tokens": 1048576,
"max_output_tokens": 65536,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"

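A hedged sketch of reading the new pricing fields at runtime (get_model_info is an existing litellm helper; the model string is illustrative):

import litellm

info = litellm.get_model_info("gemini/gemini-2.5-flash-preview-04-17")
info["output_cost_per_reasoning_token"]   # 3.5e-06, per the entry above
info.get("supports_reasoning")            # True (assuming the flag is surfaced in model info)
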
View file

@@ -69,6 +69,7 @@ class HttpxPartType(TypedDict, total=False):
functionResponse: FunctionResponse
executableCode: HttpxExecutableCode
codeExecutionResult: HttpxCodeExecutionResult
thought: bool
class HttpxContentType(TypedDict, total=False):
@@ -166,6 +167,11 @@ class SafetSettingsConfig(TypedDict, total=False):
method: HarmBlockMethod
class GeminiThinkingConfig(TypedDict, total=False):
includeThoughts: bool
thinkingBudget: int
class GenerationConfig(TypedDict, total=False):
temperature: float
top_p: float
@@ -181,6 +187,7 @@ class GenerationConfig(TypedDict, total=False):
responseLogprobs: bool
logprobs: int
responseModalities: List[Literal["TEXT", "IMAGE", "AUDIO", "VIDEO"]]
thinkingConfig: GeminiThinkingConfig
class Tools(TypedDict, total=False):
@@ -212,6 +219,7 @@ class UsageMetadata(TypedDict, total=False):
candidatesTokenCount: int
cachedContentTokenCount: int
promptTokensDetails: List[PromptTokensDetails]
thoughtsTokenCount: int
class CachedContent(TypedDict, total=False):
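
For illustration, a minimal sketch of the new typed dicts in use (a sketch only; the actual request assembly happens in the Gemini transformation code above):

from litellm.types.llms.vertex_ai import GeminiThinkingConfig, GenerationConfig

thinking: GeminiThinkingConfig = {"includeThoughts": True, "thinkingBudget": 2048}
generation_config: GenerationConfig = {"temperature": 0.0, "thinkingConfig": thinking}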

View file

@@ -150,6 +150,7 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
] # only for vertex ai models
output_cost_per_image: Optional[float]
output_vector_size: Optional[int]
output_cost_per_reasoning_token: Optional[float]
output_cost_per_video_per_second: Optional[float] # only for vertex ai models
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
output_cost_per_second: Optional[float] # for OpenAI Speech models
@@ -829,8 +830,11 @@ class Usage(CompletionUsage):
# handle reasoning_tokens
_completion_tokens_details: Optional[CompletionTokensDetailsWrapper] = None
if reasoning_tokens:
text_tokens = (
completion_tokens - reasoning_tokens if completion_tokens else None
)
completion_tokens_details = CompletionTokensDetailsWrapper(
reasoning_tokens=reasoning_tokens
reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
)
# Ensure completion_tokens_details is properly handled
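
A minimal sketch of the resulting Usage behavior (token counts are illustrative):

from litellm.types.utils import Usage

usage = Usage(prompt_tokens=10, completion_tokens=100, total_tokens=110, reasoning_tokens=60)
usage.completion_tokens_details.reasoning_tokens  # 60
usage.completion_tokens_details.text_tokens       # 40 == completion_tokens - reasoning_tokens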

View file

@@ -516,9 +516,9 @@ def function_setup( # noqa: PLR0915
function_id: Optional[str] = kwargs["id"] if "id" in kwargs else None
## DYNAMIC CALLBACKS ##
dynamic_callbacks: Optional[List[Union[str, Callable, CustomLogger]]] = (
kwargs.pop("callbacks", None)
)
dynamic_callbacks: Optional[
List[Union[str, Callable, CustomLogger]]
] = kwargs.pop("callbacks", None)
all_callbacks = get_dynamic_callbacks(dynamic_callbacks=dynamic_callbacks)
if len(all_callbacks) > 0:
@@ -1202,9 +1202,9 @@ def client(original_function): # noqa: PLR0915
exception=e,
retry_policy=kwargs.get("retry_policy"),
)
kwargs["retry_policy"] = (
reset_retry_policy()
) # prevent infinite loops
kwargs[
"retry_policy"
] = reset_retry_policy() # prevent infinite loops
litellm.num_retries = (
None # set retries to None to prevent infinite loops
)
@@ -3013,16 +3013,16 @@ def get_optional_params( # noqa: PLR0915
True # so that main.py adds the function call to the prompt
)
if "tools" in non_default_params:
optional_params["functions_unsupported_model"] = (
non_default_params.pop("tools")
)
optional_params[
"functions_unsupported_model"
] = non_default_params.pop("tools")
non_default_params.pop(
"tool_choice", None
) # causes ollama requests to hang
elif "functions" in non_default_params:
optional_params["functions_unsupported_model"] = (
non_default_params.pop("functions")
)
optional_params[
"functions_unsupported_model"
] = non_default_params.pop("functions")
elif (
litellm.add_function_to_prompt
): # if user opts to add it to prompt instead
@@ -3045,10 +3045,10 @@ def get_optional_params( # noqa: PLR0915
if "response_format" in non_default_params:
if provider_config is not None:
non_default_params["response_format"] = (
provider_config.get_json_schema_from_pydantic_object(
response_format=non_default_params["response_format"]
)
non_default_params[
"response_format"
] = provider_config.get_json_schema_from_pydantic_object(
response_format=non_default_params["response_format"]
)
else:
non_default_params["response_format"] = type_to_response_format_param(
@@ -4064,9 +4064,9 @@ def _count_characters(text: str) -> int:
def get_response_string(response_obj: Union[ModelResponse, ModelResponseStream]) -> str:
_choices: Union[List[Union[Choices, StreamingChoices]], List[StreamingChoices]] = (
response_obj.choices
)
_choices: Union[
List[Union[Choices, StreamingChoices]], List[StreamingChoices]
] = response_obj.choices
response_str = ""
for choice in _choices:
@@ -4563,6 +4563,9 @@ def _get_model_info_helper( # noqa: PLR0915
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_reasoning_token=_model_info.get(
"output_cost_per_reasoning_token", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),