Gemini-2.5-flash - support reasoning cost calc + return reasoning content (#10141)

* build(model_prices_and_context_window.json): add vertex ai gemini-2.5-flash pricing

* build(model_prices_and_context_window.json): add gemini reasoning token pricing

* fix(vertex_and_google_ai_studio_gemini.py): support counting thinking tokens for gemini

allows accurate cost calc

* fix(utils.py): add reasoning token cost calc to generic cost calc

ensures gemini-2.5-flash cost calculation is accurate

* build(model_prices_and_context_window.json): mark gemini-2.5-flash as 'supports_reasoning'

* feat(gemini/): support 'thinking' + 'reasoning_effort' params + new unit tests

allows controlling thinking effort for gemini-2.5-flash models (usage sketch below, after the commit metadata)

* test: update unit tests

* feat(vertex_and_google_ai_studio_gemini.py): return reasoning content if given in gemini response

* test: update model name

* fix: fix ruff check

* test(test_spend_management_endpoints.py): update tests to be less sensitive to new keys / updates to usage object

* fix(vertex_and_google_ai_studio_gemini.py): fix translation
Krish Dholakia 2025-04-19 09:20:52 -07:00 committed by GitHub
parent db4ebe10c8
commit 36308a31be
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 453 additions and 88 deletions
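
A hedged end-to-end usage sketch of what this commit enables (the model string, prompt, and attribute access below are illustrative, not taken from the diff):

import litellm

response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",  # illustrative model string
    messages=[{"role": "user", "content": "Outline a plan to debug a flaky test."}],
    reasoning_effort="low",  # mapped below to thinkingConfig={"thinkingBudget": 1024, "includeThoughts": True}
)

print(response.choices[0].message.content)
# thought parts returned by Gemini are surfaced separately (may be None):
print(getattr(response.choices[0].message, "reasoning_content", None))
# thoughtsTokenCount is mapped into usage for accurate cost calculation:
details = response.usage.completion_tokens_details
print(details.reasoning_tokens if details else None)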

View file

@@ -21,6 +21,10 @@ DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
########## Networking constants ##############################################################
_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour
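
For reference, a small sketch of how these constants are used later in this diff: OpenAI-style reasoning_effort values map onto fixed thinking budgets (the dict below is illustrative; the real mappings live in the Anthropic and Gemini configs):

# illustrative mapping only; mirrors the values defined above
REASONING_EFFORT_TO_THINKING_BUDGET = {
    "low": 1024,     # DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET
    "medium": 2048,  # DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET
    "high": 4096,    # DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET
}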

View file

@@ -267,6 +267,7 @@ def generic_cost_per_token(
## CALCULATE OUTPUT COST
text_tokens = usage.completion_tokens
audio_tokens = 0
reasoning_tokens = 0
if usage.completion_tokens_details is not None:
audio_tokens = (
cast(
@@ -282,7 +283,13 @@
)
or usage.completion_tokens # default to completion tokens, if this field is not set
)
reasoning_tokens = (
cast(
Optional[int],
getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
)
or 0
)
## TEXT COST
completion_cost = float(text_tokens) * completion_base_cost
@@ -290,6 +297,10 @@
"output_cost_per_audio_token"
)
_output_cost_per_reasoning_token: Optional[float] = model_info.get(
"output_cost_per_reasoning_token"
)
## AUDIO COST
if (
_output_cost_per_audio_token is not None
@@ -298,4 +309,12 @@
):
completion_cost += float(audio_tokens) * _output_cost_per_audio_token
## REASONING COST
if (
_output_cost_per_reasoning_token is not None
and reasoning_tokens
and reasoning_tokens > 0
):
completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
return prompt_cost, completion_cost
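
To make the new reasoning branch concrete, a worked sketch with assumed token counts and the gemini-2.5-flash preview prices added later in this diff (assumes completion_tokens_details splits text and reasoning tokens):

# illustrative numbers only
input_cost_per_token = 0.15e-6
output_cost_per_token = 0.6e-6
output_cost_per_reasoning_token = 3.5e-6

prompt_tokens = 1_000
text_tokens = 200        # non-thinking completion tokens, billed at the text rate
reasoning_tokens = 800   # thinking tokens, billed at the reasoning rate

prompt_cost = prompt_tokens * input_cost_per_token                      # ~$0.00015
completion_cost = text_tokens * output_cost_per_token                   # ~$0.00012
completion_cost += reasoning_tokens * output_cost_per_reasoning_token   # +$0.00280 -> ~$0.00292 total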

View file

@@ -7,6 +7,9 @@ import httpx
import litellm
from litellm.constants import (
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
RESPONSE_FORMAT_TOOL_NAME,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
@@ -276,11 +279,20 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig):
if reasoning_effort is None:
return None
elif reasoning_effort == "low":
return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
)
elif reasoning_effort == "medium":
return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
elif reasoning_effort == "high":
return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
return AnthropicThinkingParam(
type="enabled",
budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
)
else:
raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
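
The same OpenAI-style parameter therefore keeps working for Anthropic models, now driven by the shared constants; a hedged usage sketch (the model name is illustrative):

import litellm

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",  # illustrative model name
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    reasoning_effort="medium",  # -> thinking={"type": "enabled", "budget_tokens": 2048}
)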

View file

@@ -7,6 +7,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
)
from litellm.types.llms.openai import AllMessageValues
from litellm.types.llms.vertex_ai import ContentType, PartType
from litellm.utils import supports_reasoning
from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history
from ...vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig
@@ -67,7 +68,7 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@@ -83,6 +84,10 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
"frequency_penalty",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_openai_params(
self,
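
A hedged sketch of the gating above: only models flagged as supporting reasoning advertise the two extra params (model strings are illustrative):

from litellm.utils import supports_reasoning

# models with "supports_reasoning": true in the pricing map also expose
# "reasoning_effort" and "thinking" as supported OpenAI params
supports_reasoning(model="gemini/gemini-2.5-flash-preview-04-17")  # True, per the pricing entry in this PR
supports_reasoning(model="gemini/gemini-1.5-flash")                # False (illustrative)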

View file

@@ -24,6 +24,11 @@ import litellm
import litellm.litellm_core_utils
import litellm.litellm_core_utils.litellm_logging
from litellm import verbose_logger
from litellm.constants import (
DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.llms.custom_httpx.http_handler import (
@@ -31,6 +36,7 @@ from litellm.llms.custom_httpx.http_handler import (
HTTPHandler,
get_async_httpx_client,
)
from litellm.types.llms.anthropic import AnthropicThinkingParam
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionResponseMessage,
@@ -45,6 +51,7 @@ from litellm.types.llms.vertex_ai import (
ContentType,
FunctionCallingConfig,
FunctionDeclaration,
GeminiThinkingConfig,
GenerateContentResponseBody,
HttpxPartType,
LogprobsResult,
@@ -59,7 +66,7 @@ from litellm.types.utils import (
TopLogprob,
Usage,
)
from litellm.utils import CustomStreamWrapper, ModelResponse
from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning
from ....utils import _remove_additional_properties, _remove_strict_from_schema
from ..common_utils import VertexAIError, _build_vertex_schema
@@ -190,7 +197,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
return super().get_config()
def get_supported_openai_params(self, model: str) -> List[str]:
return [
supported_params = [
"temperature",
"top_p",
"max_tokens",
@@ -210,6 +217,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
"top_logprobs",
"modalities",
]
if supports_reasoning(model):
supported_params.append("reasoning_effort")
supported_params.append("thinking")
return supported_params
def map_tool_choice_values(
self, model: str, tool_choice: Union[str, dict]
@@ -313,10 +324,14 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
if isinstance(old_schema, list):
for item in old_schema:
if isinstance(item, dict):
item = _build_vertex_schema(parameters=item, add_property_ordering=True)
item = _build_vertex_schema(
parameters=item, add_property_ordering=True
)
elif isinstance(old_schema, dict):
old_schema = _build_vertex_schema(parameters=old_schema, add_property_ordering=True)
old_schema = _build_vertex_schema(
parameters=old_schema, add_property_ordering=True
)
return old_schema
def apply_response_schema_transformation(self, value: dict, optional_params: dict):
@ -343,6 +358,43 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
value=optional_params["response_schema"]
)
@staticmethod
def _map_reasoning_effort_to_thinking_budget(
reasoning_effort: str,
) -> GeminiThinkingConfig:
if reasoning_effort == "low":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "medium":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "high":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
"includeThoughts": True,
}
else:
raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
@staticmethod
def _map_thinking_param(
thinking_param: AnthropicThinkingParam,
) -> GeminiThinkingConfig:
thinking_enabled = thinking_param.get("type") == "enabled"
thinking_budget = thinking_param.get("budget_tokens")
params: GeminiThinkingConfig = {}
if thinking_enabled:
params["includeThoughts"] = True
if thinking_budget:
params["thinkingBudget"] = thinking_budget
return params
def map_openai_params(
self,
non_default_params: Dict,
@@ -399,6 +451,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
optional_params["tool_choice"] = _tool_choice_value
elif param == "seed":
optional_params["seed"] = value
elif param == "reasoning_effort" and isinstance(value, str):
optional_params[
"thinkingConfig"
] = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(value)
elif param == "thinking":
optional_params[
"thinkingConfig"
] = VertexGeminiConfig._map_thinking_param(
cast(AnthropicThinkingParam, value)
)
elif param == "modalities" and isinstance(value, list):
response_modalities = []
for modality in value:
@@ -514,19 +576,27 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
def get_assistant_content_message(
self, parts: List[HttpxPartType]
) -> Optional[str]:
_content_str = ""
) -> Tuple[Optional[str], Optional[str]]:
content_str: Optional[str] = None
reasoning_content_str: Optional[str] = None
for part in parts:
_content_str = ""
if "text" in part:
_content_str += part["text"]
elif "inlineData" in part: # base64 encoded image
_content_str += "data:{};base64,{}".format(
part["inlineData"]["mimeType"], part["inlineData"]["data"]
)
if part.get("thought") is True:
if reasoning_content_str is None:
reasoning_content_str = ""
reasoning_content_str += _content_str
else:
if content_str is None:
content_str = ""
content_str += _content_str
if _content_str:
return _content_str
return None
return content_str, reasoning_content_str
def _transform_parts(
self,
@@ -677,6 +747,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens: Optional[int] = None
text_tokens: Optional[int] = None
prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
reasoning_tokens: Optional[int] = None
if "cachedContentTokenCount" in completion_response["usageMetadata"]:
cached_tokens = completion_response["usageMetadata"][
"cachedContentTokenCount"
@@ -687,7 +758,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens = detail["tokenCount"]
elif detail["modality"] == "TEXT":
text_tokens = detail["tokenCount"]
if "thoughtsTokenCount" in completion_response["usageMetadata"]:
reasoning_tokens = completion_response["usageMetadata"][
"thoughtsTokenCount"
]
prompt_tokens_details = PromptTokensDetailsWrapper(
cached_tokens=cached_tokens,
audio_tokens=audio_tokens,
@ -703,6 +777,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
),
total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
prompt_tokens_details=prompt_tokens_details,
reasoning_tokens=reasoning_tokens,
)
return usage
@@ -731,11 +806,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
citation_metadata.append(candidate["citationMetadata"])
if "parts" in candidate["content"]:
chat_completion_message[
"content"
] = VertexGeminiConfig().get_assistant_content_message(
(
content,
reasoning_content,
) = VertexGeminiConfig().get_assistant_content_message(
parts=candidate["content"]["parts"]
)
if content is not None:
chat_completion_message["content"] = content
if reasoning_content is not None:
chat_completion_message["reasoning_content"] = reasoning_content
functions, tools = self._transform_parts(
parts=candidate["content"]["parts"],
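
Putting the request-side helpers above together, a short sketch of what they return (both are static methods, so they can be called on the class; values follow the constants at the top of this diff):

from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig

VertexGeminiConfig._map_reasoning_effort_to_thinking_budget("low")
# -> {"thinkingBudget": 1024, "includeThoughts": True}

VertexGeminiConfig._map_thinking_param({"type": "enabled", "budget_tokens": 512})
# -> {"includeThoughts": True, "thinkingBudget": 512}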

View file

@@ -5178,9 +5178,10 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 0.0000001,
"input_cost_per_token": 0.00000015,
"output_cost_per_token": 0.00000060,
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5188,9 +5189,39 @@
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_reasoning": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"
},
"gemini-2.5-flash-preview-04-17": {
"max_tokens": 65536,
"max_input_tokens": 1048576,
"max_output_tokens": 65536,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": false,
"supports_tool_choice": true,
"supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"],
"supported_modalities": ["text", "image", "audio", "video"],
"supported_output_modalities": ["text"],
"source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview"

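A hedged sketch of reading the new pricing fields at runtime (get_model_info is an existing litellm helper; the model string is illustrative):

import litellm

info = litellm.get_model_info("gemini/gemini-2.5-flash-preview-04-17")
info["output_cost_per_reasoning_token"]   # 3.5e-06, per the entry above
info.get("supports_reasoning")            # True (assuming the flag is surfaced in model info)
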
View file

@@ -69,6 +69,7 @@ class HttpxPartType(TypedDict, total=False):
functionResponse: FunctionResponse
executableCode: HttpxExecutableCode
codeExecutionResult: HttpxCodeExecutionResult
thought: bool
class HttpxContentType(TypedDict, total=False):
@@ -166,6 +167,11 @@ class SafetSettingsConfig(TypedDict, total=False):
method: HarmBlockMethod
class GeminiThinkingConfig(TypedDict, total=False):
includeThoughts: bool
thinkingBudget: int
class GenerationConfig(TypedDict, total=False):
temperature: float
top_p: float
@@ -181,6 +187,7 @@ class GenerationConfig(TypedDict, total=False):
responseLogprobs: bool
logprobs: int
responseModalities: List[Literal["TEXT", "IMAGE", "AUDIO", "VIDEO"]]
thinkingConfig: GeminiThinkingConfig
class Tools(TypedDict, total=False):
@@ -212,6 +219,7 @@ class UsageMetadata(TypedDict, total=False):
candidatesTokenCount: int
cachedContentTokenCount: int
promptTokensDetails: List[PromptTokensDetails]
thoughtsTokenCount: int
class CachedContent(TypedDict, total=False):
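
For illustration, a minimal sketch of the new typed dicts in use (a sketch only; the actual request assembly happens in the Gemini transformation code above):

from litellm.types.llms.vertex_ai import GeminiThinkingConfig, GenerationConfig

thinking: GeminiThinkingConfig = {"includeThoughts": True, "thinkingBudget": 2048}
generation_config: GenerationConfig = {"temperature": 0.0, "thinkingConfig": thinking}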

View file

@@ -150,6 +150,7 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
] # only for vertex ai models
output_cost_per_image: Optional[float]
output_vector_size: Optional[int]
output_cost_per_reasoning_token: Optional[float]
output_cost_per_video_per_second: Optional[float] # only for vertex ai models
output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
output_cost_per_second: Optional[float] # for OpenAI Speech models
@@ -829,8 +830,11 @@ class Usage(CompletionUsage):
# handle reasoning_tokens
_completion_tokens_details: Optional[CompletionTokensDetailsWrapper] = None
if reasoning_tokens:
text_tokens = (
completion_tokens - reasoning_tokens if completion_tokens else None
)
completion_tokens_details = CompletionTokensDetailsWrapper(
reasoning_tokens=reasoning_tokens
reasoning_tokens=reasoning_tokens, text_tokens=text_tokens
)
# Ensure completion_tokens_details is properly handled
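
A minimal sketch of the resulting Usage behavior (token counts are illustrative):

from litellm.types.utils import Usage

usage = Usage(prompt_tokens=10, completion_tokens=100, total_tokens=110, reasoning_tokens=60)
usage.completion_tokens_details.reasoning_tokens  # 60
usage.completion_tokens_details.text_tokens       # 40 == completion_tokens - reasoning_tokens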

View file

@@ -516,9 +516,9 @@ def function_setup( # noqa: PLR0915
function_id: Optional[str] = kwargs["id"] if "id" in kwargs else None
## DYNAMIC CALLBACKS ##
dynamic_callbacks: Optional[List[Union[str, Callable, CustomLogger]]] = (
kwargs.pop("callbacks", None)
)
dynamic_callbacks: Optional[
List[Union[str, Callable, CustomLogger]]
] = kwargs.pop("callbacks", None)
all_callbacks = get_dynamic_callbacks(dynamic_callbacks=dynamic_callbacks)
if len(all_callbacks) > 0:
@@ -1202,9 +1202,9 @@ def client(original_function): # noqa: PLR0915
exception=e,
retry_policy=kwargs.get("retry_policy"),
)
kwargs["retry_policy"] = (
reset_retry_policy()
) # prevent infinite loops
kwargs[
"retry_policy"
] = reset_retry_policy() # prevent infinite loops
litellm.num_retries = (
None # set retries to None to prevent infinite loops
)
@@ -3013,16 +3013,16 @@ def get_optional_params( # noqa: PLR0915
True # so that main.py adds the function call to the prompt
)
if "tools" in non_default_params:
optional_params["functions_unsupported_model"] = (
non_default_params.pop("tools")
)
optional_params[
"functions_unsupported_model"
] = non_default_params.pop("tools")
non_default_params.pop(
"tool_choice", None
) # causes ollama requests to hang
elif "functions" in non_default_params:
optional_params["functions_unsupported_model"] = (
non_default_params.pop("functions")
)
optional_params[
"functions_unsupported_model"
] = non_default_params.pop("functions")
elif (
litellm.add_function_to_prompt
): # if user opts to add it to prompt instead
@@ -3045,10 +3045,10 @@ def get_optional_params( # noqa: PLR0915
if "response_format" in non_default_params:
if provider_config is not None:
non_default_params["response_format"] = (
provider_config.get_json_schema_from_pydantic_object(
response_format=non_default_params["response_format"]
)
non_default_params[
"response_format"
] = provider_config.get_json_schema_from_pydantic_object(
response_format=non_default_params["response_format"]
)
else:
non_default_params["response_format"] = type_to_response_format_param(
@@ -4064,9 +4064,9 @@ def _count_characters(text: str) -> int:
def get_response_string(response_obj: Union[ModelResponse, ModelResponseStream]) -> str:
_choices: Union[List[Union[Choices, StreamingChoices]], List[StreamingChoices]] = (
response_obj.choices
)
_choices: Union[
List[Union[Choices, StreamingChoices]], List[StreamingChoices]
] = response_obj.choices
response_str = ""
for choice in _choices:
@@ -4563,6 +4563,9 @@ def _get_model_info_helper( # noqa: PLR0915
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_reasoning_token=_model_info.get(
"output_cost_per_reasoning_token", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),