From 36308a31beec93d674037f6eb96fe9df785ebc14 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Sat, 19 Apr 2025 09:20:52 -0700 Subject: [PATCH] Gemini-2.5-flash - support reasoning cost calc + return reasoning content (#10141) * build(model_prices_and_context_window.json): add vertex ai gemini-2.5-flash pricing * build(model_prices_and_context_window.json): add gemini reasoning token pricing * fix(vertex_and_google_ai_studio_gemini.py): support counting thinking tokens for gemini allows accurate cost calc * fix(utils.py): add reasoning token cost calc to generic cost calc ensures gemini-2.5-flash cost calculation is accurate * build(model_prices_and_context_window.json): mark gemini-2.5-flash as 'supports_reasoning' * feat(gemini/): support 'thinking' + 'reasoning_effort' params + new unit tests allow controlling thinking effort for gemini-2.5-flash models * test: update unit testing * feat(vertex_and_google_ai_studio_gemini.py): return reasoning content if given in gemini response * test: update model name * fix: fix ruff check * test(test_spend_management_endpoints.py): update tests to be less sensitive to new keys / updates to usage object * fix(vertex_and_google_ai_studio_gemini.py): fix translation --- .gitignore | 1 + litellm/constants.py | 4 + .../litellm_core_utils/llm_cost_calc/utils.py | 21 ++- litellm/llms/anthropic/chat/transformation.py | 18 ++- litellm/llms/gemini/chat/transformation.py | 7 +- .../vertex_and_google_ai_studio_gemini.py | 106 ++++++++++++-- ...odel_prices_and_context_window_backup.json | 37 ++++- litellm/types/llms/vertex_ai.py | 8 ++ litellm/types/utils.py | 6 +- litellm/utils.py | 41 +++--- model_prices_and_context_window.json | 37 ++++- .../llm_cost_calc/test_llm_cost_calc_utils.py | 53 ++++++- ...test_vertex_and_google_ai_studio_gemini.py | 24 +++- .../test_spend_management_endpoints.py | 129 ++++++++++++------ tests/llm_translation/base_llm_unit_tests.py | 45 ++++++ tests/llm_translation/test_gemini.py | 4 + 16 files changed, 453 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index 81fff2d342..4259b80f55 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,4 @@ litellm/proxy/db/migrations/0_init/migration.sql litellm/proxy/db/migrations/* litellm/proxy/migrations/*config.yaml litellm/proxy/migrations/* +tests/litellm/litellm_core_utils/llm_cost_calc/log.txt diff --git a/litellm/constants.py b/litellm/constants.py index 9c30dc06a2..f48ce97afe 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -21,6 +21,10 @@ DEFAULT_MAX_TOKENS = 256 # used when providers need a default MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. 
+DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024 +DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048 +DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096 + ########## Networking constants ############################################################## _DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour diff --git a/litellm/litellm_core_utils/llm_cost_calc/utils.py b/litellm/litellm_core_utils/llm_cost_calc/utils.py index ae5eb286e4..15b8282820 100644 --- a/litellm/litellm_core_utils/llm_cost_calc/utils.py +++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py @@ -267,6 +267,7 @@ def generic_cost_per_token( ## CALCULATE OUTPUT COST text_tokens = usage.completion_tokens audio_tokens = 0 + reasoning_tokens = 0 if usage.completion_tokens_details is not None: audio_tokens = ( cast( @@ -282,7 +283,13 @@ def generic_cost_per_token( ) or usage.completion_tokens # default to completion tokens, if this field is not set ) - + reasoning_tokens = ( + cast( + Optional[int], + getattr(usage.completion_tokens_details, "reasoning_tokens", 0), + ) + or 0 + ) ## TEXT COST completion_cost = float(text_tokens) * completion_base_cost @@ -290,6 +297,10 @@ def generic_cost_per_token( "output_cost_per_audio_token" ) + _output_cost_per_reasoning_token: Optional[float] = model_info.get( + "output_cost_per_reasoning_token" + ) + ## AUDIO COST if ( _output_cost_per_audio_token is not None @@ -298,4 +309,12 @@ def generic_cost_per_token( ): completion_cost += float(audio_tokens) * _output_cost_per_audio_token + ## REASONING COST + if ( + _output_cost_per_reasoning_token is not None + and reasoning_tokens + and reasoning_tokens > 0 + ): + completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token + return prompt_cost, completion_cost diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index 590931321d..2bf9d0d992 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -7,6 +7,9 @@ import httpx import litellm from litellm.constants import ( DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, + DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET, + DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET, + DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET, RESPONSE_FORMAT_TOOL_NAME, ) from litellm.litellm_core_utils.core_helpers import map_finish_reason @@ -276,11 +279,20 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig): if reasoning_effort is None: return None elif reasoning_effort == "low": - return AnthropicThinkingParam(type="enabled", budget_tokens=1024) + return AnthropicThinkingParam( + type="enabled", + budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET, + ) elif reasoning_effort == "medium": - return AnthropicThinkingParam(type="enabled", budget_tokens=2048) + return AnthropicThinkingParam( + type="enabled", + budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET, + ) elif reasoning_effort == "high": - return AnthropicThinkingParam(type="enabled", budget_tokens=4096) + return AnthropicThinkingParam( + type="enabled", + budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET, + ) else: raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}") diff --git a/litellm/llms/gemini/chat/transformation.py b/litellm/llms/gemini/chat/transformation.py index 795333d598..dc65c46455 100644 --- a/litellm/llms/gemini/chat/transformation.py +++ b/litellm/llms/gemini/chat/transformation.py @@ -7,6 +7,7 @@ from 
litellm.litellm_core_utils.prompt_templates.factory import ( ) from litellm.types.llms.openai import AllMessageValues from litellm.types.llms.vertex_ai import ContentType, PartType +from litellm.utils import supports_reasoning from ...vertex_ai.gemini.transformation import _gemini_convert_messages_with_history from ...vertex_ai.gemini.vertex_and_google_ai_studio_gemini import VertexGeminiConfig @@ -67,7 +68,7 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig): return super().get_config() def get_supported_openai_params(self, model: str) -> List[str]: - return [ + supported_params = [ "temperature", "top_p", "max_tokens", @@ -83,6 +84,10 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig): "frequency_penalty", "modalities", ] + if supports_reasoning(model): + supported_params.append("reasoning_effort") + supported_params.append("thinking") + return supported_params def map_openai_params( self, diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py index 3a58bb2c6d..ba57c8d225 100644 --- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py +++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py @@ -24,6 +24,11 @@ import litellm import litellm.litellm_core_utils import litellm.litellm_core_utils.litellm_logging from litellm import verbose_logger +from litellm.constants import ( + DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET, + DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET, + DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET, +) from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException from litellm.llms.custom_httpx.http_handler import ( @@ -31,6 +36,7 @@ from litellm.llms.custom_httpx.http_handler import ( HTTPHandler, get_async_httpx_client, ) +from litellm.types.llms.anthropic import AnthropicThinkingParam from litellm.types.llms.openai import ( AllMessageValues, ChatCompletionResponseMessage, @@ -45,6 +51,7 @@ from litellm.types.llms.vertex_ai import ( ContentType, FunctionCallingConfig, FunctionDeclaration, + GeminiThinkingConfig, GenerateContentResponseBody, HttpxPartType, LogprobsResult, @@ -59,7 +66,7 @@ from litellm.types.utils import ( TopLogprob, Usage, ) -from litellm.utils import CustomStreamWrapper, ModelResponse +from litellm.utils import CustomStreamWrapper, ModelResponse, supports_reasoning from ....utils import _remove_additional_properties, _remove_strict_from_schema from ..common_utils import VertexAIError, _build_vertex_schema @@ -190,7 +197,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): return super().get_config() def get_supported_openai_params(self, model: str) -> List[str]: - return [ + supported_params = [ "temperature", "top_p", "max_tokens", @@ -210,6 +217,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): "top_logprobs", "modalities", ] + if supports_reasoning(model): + supported_params.append("reasoning_effort") + supported_params.append("thinking") + return supported_params def map_tool_choice_values( self, model: str, tool_choice: Union[str, dict] @@ -313,10 +324,14 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): if isinstance(old_schema, list): for item in old_schema: if isinstance(item, dict): - item = _build_vertex_schema(parameters=item, add_property_ordering=True) + item = _build_vertex_schema( + parameters=item, add_property_ordering=True + ) elif isinstance(old_schema, dict): - 
old_schema = _build_vertex_schema(parameters=old_schema, add_property_ordering=True) + old_schema = _build_vertex_schema( + parameters=old_schema, add_property_ordering=True + ) return old_schema def apply_response_schema_transformation(self, value: dict, optional_params: dict): @@ -343,6 +358,43 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): value=optional_params["response_schema"] ) + @staticmethod + def _map_reasoning_effort_to_thinking_budget( + reasoning_effort: str, + ) -> GeminiThinkingConfig: + if reasoning_effort == "low": + return { + "thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET, + "includeThoughts": True, + } + elif reasoning_effort == "medium": + return { + "thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET, + "includeThoughts": True, + } + elif reasoning_effort == "high": + return { + "thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET, + "includeThoughts": True, + } + else: + raise ValueError(f"Invalid reasoning effort: {reasoning_effort}") + + @staticmethod + def _map_thinking_param( + thinking_param: AnthropicThinkingParam, + ) -> GeminiThinkingConfig: + thinking_enabled = thinking_param.get("type") == "enabled" + thinking_budget = thinking_param.get("budget_tokens") + + params: GeminiThinkingConfig = {} + if thinking_enabled: + params["includeThoughts"] = True + if thinking_budget: + params["thinkingBudget"] = thinking_budget + + return params + def map_openai_params( self, non_default_params: Dict, @@ -399,6 +451,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): optional_params["tool_choice"] = _tool_choice_value elif param == "seed": optional_params["seed"] = value + elif param == "reasoning_effort" and isinstance(value, str): + optional_params[ + "thinkingConfig" + ] = VertexGeminiConfig._map_reasoning_effort_to_thinking_budget(value) + elif param == "thinking": + optional_params[ + "thinkingConfig" + ] = VertexGeminiConfig._map_thinking_param( + cast(AnthropicThinkingParam, value) + ) elif param == "modalities" and isinstance(value, list): response_modalities = [] for modality in value: @@ -514,19 +576,27 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): def get_assistant_content_message( self, parts: List[HttpxPartType] - ) -> Optional[str]: - _content_str = "" + ) -> Tuple[Optional[str], Optional[str]]: + content_str: Optional[str] = None + reasoning_content_str: Optional[str] = None for part in parts: + _content_str = "" if "text" in part: _content_str += part["text"] elif "inlineData" in part: # base64 encoded image _content_str += "data:{};base64,{}".format( part["inlineData"]["mimeType"], part["inlineData"]["data"] ) + if part.get("thought") is True: + if reasoning_content_str is None: + reasoning_content_str = "" + reasoning_content_str += _content_str + else: + if content_str is None: + content_str = "" + content_str += _content_str - if _content_str: - return _content_str - return None + return content_str, reasoning_content_str def _transform_parts( self, @@ -677,6 +747,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): audio_tokens: Optional[int] = None text_tokens: Optional[int] = None prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None + reasoning_tokens: Optional[int] = None if "cachedContentTokenCount" in completion_response["usageMetadata"]: cached_tokens = completion_response["usageMetadata"][ "cachedContentTokenCount" @@ -687,7 +758,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): audio_tokens = detail["tokenCount"] elif 
detail["modality"] == "TEXT": text_tokens = detail["tokenCount"] - + if "thoughtsTokenCount" in completion_response["usageMetadata"]: + reasoning_tokens = completion_response["usageMetadata"][ + "thoughtsTokenCount" + ] prompt_tokens_details = PromptTokensDetailsWrapper( cached_tokens=cached_tokens, audio_tokens=audio_tokens, @@ -703,6 +777,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): ), total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0), prompt_tokens_details=prompt_tokens_details, + reasoning_tokens=reasoning_tokens, ) return usage @@ -731,11 +806,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig): citation_metadata.append(candidate["citationMetadata"]) if "parts" in candidate["content"]: - chat_completion_message[ - "content" - ] = VertexGeminiConfig().get_assistant_content_message( + ( + content, + reasoning_content, + ) = VertexGeminiConfig().get_assistant_content_message( parts=candidate["content"]["parts"] ) + if content is not None: + chat_completion_message["content"] = content + if reasoning_content is not None: + chat_completion_message["reasoning_content"] = reasoning_content functions, tools = self._transform_parts( parts=candidate["content"]["parts"], diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 4b0d657e05..c2456ac3db 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -5178,9 +5178,10 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_audio_token": 0.0000001, - "input_cost_per_token": 0.00000015, - "output_cost_per_token": 0.00000060, + "input_cost_per_audio_token": 1e-6, + "input_cost_per_token": 0.15e-6, + "output_cost_per_token": 0.6e-6, + "output_cost_per_reasoning_token": 3.5e-6, "litellm_provider": "gemini", "mode": "chat", "rpm": 10, @@ -5188,9 +5189,39 @@ "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, + "supports_reasoning": true, "supports_response_schema": true, "supports_audio_output": false, "supports_tool_choice": true, + "supported_endpoints": ["/v1/chat/completions", "/v1/completions"], + "supported_modalities": ["text", "image", "audio", "video"], + "supported_output_modalities": ["text"], + "source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview" + }, + "gemini-2.5-flash-preview-04-17": { + "max_tokens": 65536, + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "max_images_per_prompt": 3000, + "max_videos_per_prompt": 10, + "max_video_length": 1, + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_pdf_size_mb": 30, + "input_cost_per_audio_token": 1e-6, + "input_cost_per_token": 0.15e-6, + "output_cost_per_token": 0.6e-6, + "output_cost_per_reasoning_token": 3.5e-6, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "supports_reasoning": true, + "supports_system_messages": true, + "supports_function_calling": true, + "supports_vision": true, + "supports_response_schema": true, + "supports_audio_output": false, + "supports_tool_choice": true, + "supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"], "supported_modalities": ["text", "image", "audio", "video"], "supported_output_modalities": ["text"], "source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview" diff --git a/litellm/types/llms/vertex_ai.py b/litellm/types/llms/vertex_ai.py 
index 55273371fc..6f3707c657 100644 --- a/litellm/types/llms/vertex_ai.py +++ b/litellm/types/llms/vertex_ai.py @@ -69,6 +69,7 @@ class HttpxPartType(TypedDict, total=False): functionResponse: FunctionResponse executableCode: HttpxExecutableCode codeExecutionResult: HttpxCodeExecutionResult + thought: bool class HttpxContentType(TypedDict, total=False): @@ -166,6 +167,11 @@ class SafetSettingsConfig(TypedDict, total=False): method: HarmBlockMethod +class GeminiThinkingConfig(TypedDict, total=False): + includeThoughts: bool + thinkingBudget: int + + class GenerationConfig(TypedDict, total=False): temperature: float top_p: float @@ -181,6 +187,7 @@ class GenerationConfig(TypedDict, total=False): responseLogprobs: bool logprobs: int responseModalities: List[Literal["TEXT", "IMAGE", "AUDIO", "VIDEO"]] + thinkingConfig: GeminiThinkingConfig class Tools(TypedDict, total=False): @@ -212,6 +219,7 @@ class UsageMetadata(TypedDict, total=False): candidatesTokenCount: int cachedContentTokenCount: int promptTokensDetails: List[PromptTokensDetails] + thoughtsTokenCount: int class CachedContent(TypedDict, total=False): diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 35a584b6cf..ac626d4657 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -150,6 +150,7 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False): ] # only for vertex ai models output_cost_per_image: Optional[float] output_vector_size: Optional[int] + output_cost_per_reasoning_token: Optional[float] output_cost_per_video_per_second: Optional[float] # only for vertex ai models output_cost_per_audio_per_second: Optional[float] # only for vertex ai models output_cost_per_second: Optional[float] # for OpenAI Speech models @@ -829,8 +830,11 @@ class Usage(CompletionUsage): # handle reasoning_tokens _completion_tokens_details: Optional[CompletionTokensDetailsWrapper] = None if reasoning_tokens: + text_tokens = ( + completion_tokens - reasoning_tokens if completion_tokens else None + ) completion_tokens_details = CompletionTokensDetailsWrapper( - reasoning_tokens=reasoning_tokens + reasoning_tokens=reasoning_tokens, text_tokens=text_tokens ) # Ensure completion_tokens_details is properly handled diff --git a/litellm/utils.py b/litellm/utils.py index 141eadf624..6b52fe91fa 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -516,9 +516,9 @@ def function_setup( # noqa: PLR0915 function_id: Optional[str] = kwargs["id"] if "id" in kwargs else None ## DYNAMIC CALLBACKS ## - dynamic_callbacks: Optional[List[Union[str, Callable, CustomLogger]]] = ( - kwargs.pop("callbacks", None) - ) + dynamic_callbacks: Optional[ + List[Union[str, Callable, CustomLogger]] + ] = kwargs.pop("callbacks", None) all_callbacks = get_dynamic_callbacks(dynamic_callbacks=dynamic_callbacks) if len(all_callbacks) > 0: @@ -1202,9 +1202,9 @@ def client(original_function): # noqa: PLR0915 exception=e, retry_policy=kwargs.get("retry_policy"), ) - kwargs["retry_policy"] = ( - reset_retry_policy() - ) # prevent infinite loops + kwargs[ + "retry_policy" + ] = reset_retry_policy() # prevent infinite loops litellm.num_retries = ( None # set retries to None to prevent infinite loops ) @@ -3013,16 +3013,16 @@ def get_optional_params( # noqa: PLR0915 True # so that main.py adds the function call to the prompt ) if "tools" in non_default_params: - optional_params["functions_unsupported_model"] = ( - non_default_params.pop("tools") - ) + optional_params[ + "functions_unsupported_model" + ] = non_default_params.pop("tools") non_default_params.pop( 
"tool_choice", None ) # causes ollama requests to hang elif "functions" in non_default_params: - optional_params["functions_unsupported_model"] = ( - non_default_params.pop("functions") - ) + optional_params[ + "functions_unsupported_model" + ] = non_default_params.pop("functions") elif ( litellm.add_function_to_prompt ): # if user opts to add it to prompt instead @@ -3045,10 +3045,10 @@ def get_optional_params( # noqa: PLR0915 if "response_format" in non_default_params: if provider_config is not None: - non_default_params["response_format"] = ( - provider_config.get_json_schema_from_pydantic_object( - response_format=non_default_params["response_format"] - ) + non_default_params[ + "response_format" + ] = provider_config.get_json_schema_from_pydantic_object( + response_format=non_default_params["response_format"] ) else: non_default_params["response_format"] = type_to_response_format_param( @@ -4064,9 +4064,9 @@ def _count_characters(text: str) -> int: def get_response_string(response_obj: Union[ModelResponse, ModelResponseStream]) -> str: - _choices: Union[List[Union[Choices, StreamingChoices]], List[StreamingChoices]] = ( - response_obj.choices - ) + _choices: Union[ + List[Union[Choices, StreamingChoices]], List[StreamingChoices] + ] = response_obj.choices response_str = "" for choice in _choices: @@ -4563,6 +4563,9 @@ def _get_model_info_helper( # noqa: PLR0915 output_cost_per_character=_model_info.get( "output_cost_per_character", None ), + output_cost_per_reasoning_token=_model_info.get( + "output_cost_per_reasoning_token", None + ), output_cost_per_token_above_128k_tokens=_model_info.get( "output_cost_per_token_above_128k_tokens", None ), diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 4b0d657e05..c2456ac3db 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -5178,9 +5178,10 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_audio_token": 0.0000001, - "input_cost_per_token": 0.00000015, - "output_cost_per_token": 0.00000060, + "input_cost_per_audio_token": 1e-6, + "input_cost_per_token": 0.15e-6, + "output_cost_per_token": 0.6e-6, + "output_cost_per_reasoning_token": 3.5e-6, "litellm_provider": "gemini", "mode": "chat", "rpm": 10, @@ -5188,9 +5189,39 @@ "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, + "supports_reasoning": true, "supports_response_schema": true, "supports_audio_output": false, "supports_tool_choice": true, + "supported_endpoints": ["/v1/chat/completions", "/v1/completions"], + "supported_modalities": ["text", "image", "audio", "video"], + "supported_output_modalities": ["text"], + "source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview" + }, + "gemini-2.5-flash-preview-04-17": { + "max_tokens": 65536, + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "max_images_per_prompt": 3000, + "max_videos_per_prompt": 10, + "max_video_length": 1, + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_pdf_size_mb": 30, + "input_cost_per_audio_token": 1e-6, + "input_cost_per_token": 0.15e-6, + "output_cost_per_token": 0.6e-6, + "output_cost_per_reasoning_token": 3.5e-6, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "supports_reasoning": true, + "supports_system_messages": true, + "supports_function_calling": true, + "supports_vision": true, + "supports_response_schema": true, + "supports_audio_output": false, + 
"supports_tool_choice": true, + "supported_endpoints": ["/v1/chat/completions", "/v1/completions", "/v1/batch"], "supported_modalities": ["text", "image", "audio", "video"], "supported_output_modalities": ["text"], "source": "https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview" diff --git a/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py b/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py index 8f8f043935..ab501f5821 100644 --- a/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py +++ b/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py @@ -10,7 +10,13 @@ from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import ( StandardBuiltInToolCostTracking, ) from litellm.types.llms.openai import FileSearchTool, WebSearchOptions -from litellm.types.utils import ModelInfo, ModelResponse, StandardBuiltInToolsParams +from litellm.types.utils import ( + CompletionTokensDetailsWrapper, + ModelInfo, + ModelResponse, + PromptTokensDetailsWrapper, + StandardBuiltInToolsParams, +) sys.path.insert( 0, os.path.abspath("../../..") @@ -20,6 +26,51 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_toke from litellm.types.utils import Usage +def test_reasoning_tokens_gemini(): + model = "gemini-2.5-flash-preview-04-17" + custom_llm_provider = "gemini" + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + usage = Usage( + completion_tokens=1578, + prompt_tokens=17, + total_tokens=1595, + completion_tokens_details=CompletionTokensDetailsWrapper( + accepted_prediction_tokens=None, + audio_tokens=None, + reasoning_tokens=952, + rejected_prediction_tokens=None, + text_tokens=626, + ), + prompt_tokens_details=PromptTokensDetailsWrapper( + audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None + ), + ) + model_cost_map = litellm.model_cost[model] + prompt_cost, completion_cost = generic_cost_per_token( + model=model, + usage=usage, + custom_llm_provider=custom_llm_provider, + ) + + assert round(prompt_cost, 10) == round( + model_cost_map["input_cost_per_token"] * usage.prompt_tokens, + 10, + ) + assert round(completion_cost, 10) == round( + ( + model_cost_map["output_cost_per_token"] + * usage.completion_tokens_details.text_tokens + ) + + ( + model_cost_map["output_cost_per_reasoning_token"] + * usage.completion_tokens_details.reasoning_tokens + ), + 10, + ) + + def test_generic_cost_per_token_above_200k_tokens(): model = "gemini-2.5-pro-exp-03-25" custom_llm_provider = "vertex_ai" diff --git a/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py b/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py index 41a37e2e57..bb00ff3ba0 100644 --- a/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py +++ b/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py @@ -1,7 +1,9 @@ import asyncio +from typing import List, cast from unittest.mock import MagicMock import pytest +from pydantic import BaseModel import litellm from litellm import ModelResponse @@ -9,8 +11,6 @@ from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( VertexGeminiConfig, ) from litellm.types.utils import ChoiceLogprobs -from pydantic import BaseModel -from typing import List, cast def test_top_logprobs(): @@ -66,7 +66,6 @@ def test_get_model_name_from_gemini_spec_model(): assert result == 
"ft-uuid-123" - def test_vertex_ai_response_schema_dict(): v = VertexGeminiConfig() transformed_request = v.map_openai_params( @@ -221,3 +220,22 @@ def test_vertex_ai_retain_property_ordering(): schema = transformed_request["response_schema"] # should leave existing value alone, despite dictionary ordering assert schema["propertyOrdering"] == ["thought", "output"] + + +def test_vertex_ai_thinking_output_part(): + from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( + VertexGeminiConfig, + ) + from litellm.types.llms.vertex_ai import HttpxPartType + + v = VertexGeminiConfig() + parts = [ + HttpxPartType( + thought=True, + text="I'm thinking...", + ), + HttpxPartType(text="Hello world"), + ] + content, reasoning_content = v.get_assistant_content_message(parts=parts) + assert content == "Hello world" + assert reasoning_content == "I'm thinking..." diff --git a/tests/litellm/proxy/spend_tracking/test_spend_management_endpoints.py b/tests/litellm/proxy/spend_tracking/test_spend_management_endpoints.py index 080aa3bd16..b2e597fb7c 100644 --- a/tests/litellm/proxy/spend_tracking/test_spend_management_endpoints.py +++ b/tests/litellm/proxy/spend_tracking/test_spend_management_endpoints.py @@ -20,6 +20,16 @@ from litellm.proxy.hooks.proxy_track_cost_callback import _ProxyDBLogger from litellm.proxy.proxy_server import app, prisma_client from litellm.router import Router +ignored_keys = [ + "request_id", + "startTime", + "endTime", + "completionStartTime", + "endTime", + "metadata.model_map_information", + "metadata.usage_object", +] + @pytest.fixture def client(): @@ -457,7 +467,7 @@ class TestSpendLogsPayload: "model": "gpt-4o", "user": "", "team_id": "", - "metadata": '{"applied_guardrails": [], "batch_models": null, "mcp_tool_call_metadata": null, "usage_object": {"completion_tokens": 20, "prompt_tokens": 10, "total_tokens": 30, "completion_tokens_details": null, "prompt_tokens_details": null}, "model_map_information": {"model_map_key": "gpt-4o", "model_map_value": {"key": "gpt-4o", "max_tokens": 16384, "max_input_tokens": 128000, "max_output_tokens": 16384, "input_cost_per_token": 2.5e-06, "cache_creation_input_token_cost": null, "cache_read_input_token_cost": 1.25e-06, "input_cost_per_character": null, "input_cost_per_token_above_128k_tokens": null, "input_cost_per_token_above_200k_tokens": null, "input_cost_per_query": null, "input_cost_per_second": null, "input_cost_per_audio_token": null, "input_cost_per_token_batches": 1.25e-06, "output_cost_per_token_batches": 5e-06, "output_cost_per_token": 1e-05, "output_cost_per_audio_token": null, "output_cost_per_character": null, "output_cost_per_token_above_128k_tokens": null, "output_cost_per_character_above_128k_tokens": null, "output_cost_per_token_above_200k_tokens": null, "output_cost_per_second": null, "output_cost_per_image": null, "output_vector_size": null, "litellm_provider": "openai", "mode": "chat", "supports_system_messages": true, "supports_response_schema": true, "supports_vision": true, "supports_function_calling": true, "supports_tool_choice": true, "supports_assistant_prefill": false, "supports_prompt_caching": true, "supports_audio_input": false, "supports_audio_output": false, "supports_pdf_input": false, "supports_embedding_image_input": false, "supports_native_streaming": null, "supports_web_search": true, "supports_reasoning": false, "search_context_cost_per_query": {"search_context_size_low": 0.03, "search_context_size_medium": 0.035, "search_context_size_high": 0.05}, "tpm": null, "rpm": null, 
"supported_openai_params": ["frequency_penalty", "logit_bias", "logprobs", "top_logprobs", "max_tokens", "max_completion_tokens", "modalities", "prediction", "n", "presence_penalty", "seed", "stop", "stream", "stream_options", "temperature", "top_p", "tools", "tool_choice", "function_call", "functions", "max_retries", "extra_headers", "parallel_tool_calls", "audio", "response_format", "user"]}}, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}', + "metadata": '{"applied_guardrails": [], "batch_models": null, "mcp_tool_call_metadata": null, "usage_object": {"completion_tokens": 20, "prompt_tokens": 10, "total_tokens": 30, "completion_tokens_details": null, "prompt_tokens_details": null}, "model_map_information": {"model_map_key": "gpt-4o", "model_map_value": {"key": "gpt-4o", "max_tokens": 16384, "max_input_tokens": 128000, "max_output_tokens": 16384, "input_cost_per_token": 2.5e-06, "cache_creation_input_token_cost": null, "cache_read_input_token_cost": 1.25e-06, "input_cost_per_character": null, "input_cost_per_token_above_128k_tokens": null, "input_cost_per_token_above_200k_tokens": null, "input_cost_per_query": null, "input_cost_per_second": null, "input_cost_per_audio_token": null, "input_cost_per_token_batches": 1.25e-06, "output_cost_per_token_batches": 5e-06, "output_cost_per_token": 1e-05, "output_cost_per_audio_token": null, "output_cost_per_character": null, "output_cost_per_token_above_128k_tokens": null, "output_cost_per_character_above_128k_tokens": null, "output_cost_per_token_above_200k_tokens": null, "output_cost_per_second": null, "output_cost_per_reasoning_token": null, "output_cost_per_image": null, "output_vector_size": null, "litellm_provider": "openai", "mode": "chat", "supports_system_messages": true, "supports_response_schema": true, "supports_vision": true, "supports_function_calling": true, "supports_tool_choice": true, "supports_assistant_prefill": false, "supports_prompt_caching": true, "supports_audio_input": false, "supports_audio_output": false, "supports_pdf_input": false, "supports_embedding_image_input": false, "supports_native_streaming": null, "supports_web_search": true, "supports_reasoning": false, "search_context_cost_per_query": {"search_context_size_low": 0.03, "search_context_size_medium": 0.035, "search_context_size_high": 0.05}, "tpm": null, "rpm": null, "supported_openai_params": ["frequency_penalty", "logit_bias", "logprobs", "top_logprobs", "max_tokens", "max_completion_tokens", "modalities", "prediction", "n", "presence_penalty", "seed", "stop", "stream", "stream_options", "temperature", "top_p", "tools", "tool_choice", "function_call", "functions", "max_retries", "extra_headers", "parallel_tool_calls", "audio", "response_format", "user"]}}, "additional_usage_values": {"completion_tokens_details": null, "prompt_tokens_details": null}}', "cache_key": "Cache OFF", "spend": 0.00022500000000000002, "total_tokens": 30, @@ -475,19 +485,11 @@ class TestSpendLogsPayload: } ) - for key, value in expected_payload.items(): - if key in [ - "request_id", - "startTime", - "endTime", - "completionStartTime", - "endTime", - ]: - assert payload[key] is not None - else: - assert ( - payload[key] == value - ), f"Expected {key} to be {value}, but got {payload[key]}" + differences = _compare_nested_dicts( + payload, expected_payload, ignore_keys=ignored_keys + ) + if differences: + assert False, f"Dictionary mismatch: {differences}" def mock_anthropic_response(*args, **kwargs): mock_response = MagicMock() @@ -573,19 
+575,11 @@ class TestSpendLogsPayload: } ) - for key, value in expected_payload.items(): - if key in [ - "request_id", - "startTime", - "endTime", - "completionStartTime", - "endTime", - ]: - assert payload[key] is not None - else: - assert ( - payload[key] == value - ), f"Expected {key} to be {value}, but got {payload[key]}" + differences = _compare_nested_dicts( + payload, expected_payload, ignore_keys=ignored_keys + ) + if differences: + assert False, f"Dictionary mismatch: {differences}" @pytest.mark.asyncio async def test_spend_logs_payload_success_log_with_router(self): @@ -669,16 +663,71 @@ class TestSpendLogsPayload: } ) - for key, value in expected_payload.items(): - if key in [ - "request_id", - "startTime", - "endTime", - "completionStartTime", - "endTime", - ]: - assert payload[key] is not None - else: - assert ( - payload[key] == value - ), f"Expected {key} to be {value}, but got {payload[key]}" + differences = _compare_nested_dicts( + payload, expected_payload, ignore_keys=ignored_keys + ) + if differences: + assert False, f"Dictionary mismatch: {differences}" + + +def _compare_nested_dicts( + actual: dict, expected: dict, path: str = "", ignore_keys: list[str] = [] +) -> list[str]: + """Compare nested dictionaries and return a list of differences in a human-friendly format.""" + differences = [] + + # Check if current path should be ignored + if path in ignore_keys: + return differences + + # Check for keys in actual but not in expected + for key in actual.keys(): + current_path = f"{path}.{key}" if path else key + if current_path not in ignore_keys and key not in expected: + differences.append(f"Extra key in actual: {current_path}") + + for key, expected_value in expected.items(): + current_path = f"{path}.{key}" if path else key + if current_path in ignore_keys: + continue + if key not in actual: + differences.append(f"Missing key: {current_path}") + continue + + actual_value = actual[key] + + # Try to parse JSON strings + if isinstance(expected_value, str): + try: + expected_value = json.loads(expected_value) + except json.JSONDecodeError: + pass + if isinstance(actual_value, str): + try: + actual_value = json.loads(actual_value) + except json.JSONDecodeError: + pass + + if isinstance(expected_value, dict) and isinstance(actual_value, dict): + differences.extend( + _compare_nested_dicts( + actual_value, expected_value, current_path, ignore_keys + ) + ) + elif isinstance(expected_value, dict) or isinstance(actual_value, dict): + differences.append( + f"Type mismatch at {current_path}: expected dict, got {type(actual_value).__name__}" + ) + else: + # For non-dict values, only report if they're different + if actual_value != expected_value: + # Format the values to be more readable + actual_str = str(actual_value) + expected_str = str(expected_value) + if len(actual_str) > 50 or len(expected_str) > 50: + actual_str = f"{actual_str[:50]}..." + expected_str = f"{expected_str[:50]}..." 
+ differences.append( + f"Value mismatch at {current_path}:\n expected: {expected_str}\n got: {actual_str}" + ) + return differences diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py index bd3627f7d4..725ebfba59 100644 --- a/tests/llm_translation/base_llm_unit_tests.py +++ b/tests/llm_translation/base_llm_unit_tests.py @@ -76,6 +76,11 @@ class BaseLLMChatTest(ABC): """Must return the base completion call args""" pass + + def get_base_completion_call_args_with_reasoning_model(self) -> dict: + """Must return the base completion call args with reasoning_effort""" + return {} + def test_developer_role_translation(self): """ Test that the developer role is translated correctly for non-OpenAI providers. @@ -1126,6 +1131,46 @@ class BaseLLMChatTest(ABC): print(response) + def test_reasoning_effort(self): + """Test that reasoning_effort is passed correctly to the model""" + from litellm.utils import supports_reasoning + from litellm import completion + + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + base_completion_call_args = self.get_base_completion_call_args_with_reasoning_model() + if len(base_completion_call_args) == 0: + print("base_completion_call_args is empty") + pytest.skip("Model does not support reasoning") + if not supports_reasoning(base_completion_call_args["model"], None): + print("Model does not support reasoning") + pytest.skip("Model does not support reasoning") + + _, provider, _, _ = litellm.get_llm_provider( + model=base_completion_call_args["model"] + ) + + ## CHECK PARAM MAPPING + optional_params = get_optional_params( + model=base_completion_call_args["model"], + custom_llm_provider=provider, + reasoning_effort="high", + ) + # either accepts reasoning effort or thinking budget + assert "reasoning_effort" in optional_params or "4096" in json.dumps(optional_params) + + try: + litellm._turn_on_debug() + response = completion( + **base_completion_call_args, + reasoning_effort="low", + messages=[{"role": "user", "content": "Hello!"}], + ) + print(f"response: {response}") + except Exception as e: + pytest.fail(f"Error: {e}") + class BaseOSeriesModelsTest(ABC): # test across azure/openai diff --git a/tests/llm_translation/test_gemini.py b/tests/llm_translation/test_gemini.py index 9e6105e39a..c8cc0366d4 100644 --- a/tests/llm_translation/test_gemini.py +++ b/tests/llm_translation/test_gemini.py @@ -17,6 +17,9 @@ from litellm import completion class TestGoogleAIStudioGemini(BaseLLMChatTest): def get_base_completion_call_args(self) -> dict: return {"model": "gemini/gemini-2.0-flash"} + + def get_base_completion_call_args_with_reasoning_model(self) -> dict: + return {"model": "gemini/gemini-2.5-flash-preview-04-17"} def test_tool_call_no_arguments(self, tool_call_no_arguments): """Test that tool calls with no arguments is translated correctly. Relevant issue: https://github.com/BerriAI/litellm/issues/6833""" @@ -85,3 +88,4 @@ def test_gemini_image_generation(): assert response.choices[0].message.content is not None +
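
Reviewer note (not part of the patch): a minimal usage sketch of the new `reasoning_effort` / `thinking` params for Gemini added in this PR. It assumes a configured GEMINI_API_KEY and the gemini/gemini-2.5-flash-preview-04-17 model registered above; the response attribute access is illustrative, not normative.

# usage_sketch.py -- illustrative only, not part of the patch
import litellm

# "reasoning_effort" is translated to a Gemini thinkingConfig using the default
# budgets added in litellm/constants.py (low=1024, medium=2048, high=4096 tokens).
resp = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    reasoning_effort="low",
)
# Thought parts are surfaced as reasoning_content when includeThoughts is set.
print(resp.choices[0].message.reasoning_content)

# An Anthropic-style "thinking" param is also accepted and mapped to
# {"includeThoughts": True, "thinkingBudget": 1024} for Gemini.
resp = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)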
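
Reviewer note (not part of the patch): a sketch of how the new output_cost_per_reasoning_token pricing flows through generic_cost_per_token, mirroring the unit test added above; it assumes the local model cost map is loaded.

# cost_sketch.py -- illustrative only, not part of the patch
import os
import litellm
from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token
from litellm.types.utils import CompletionTokensDetailsWrapper, Usage

os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

usage = Usage(
    prompt_tokens=17,
    completion_tokens=1578,
    total_tokens=1595,
    completion_tokens_details=CompletionTokensDetailsWrapper(
        reasoning_tokens=952, text_tokens=626
    ),
)
prompt_cost, completion_cost = generic_cost_per_token(
    model="gemini-2.5-flash-preview-04-17",
    usage=usage,
    custom_llm_provider="gemini",
)
# completion_cost = text_tokens * output_cost_per_token
#                 + reasoning_tokens * output_cost_per_reasoning_token
print(prompt_cost, completion_cost)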