Andrew Wesie 2025-04-21 20:36:38 -07:00 committed by GitHub
commit cbab62f586
8 changed files with 142 additions and 19 deletions

View file

@@ -128,6 +128,10 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
except Exception:
continue
output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
if usage.get("thinking_enabled") and output_cost_per_token_thinking is not None:
completion_base_cost = output_cost_per_token_thinking
return prompt_base_cost, completion_base_cost
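The hunk above swaps in the model's output_cost_per_token_thinking rate for the completion side whenever the recorded usage says thinking was enabled. A minimal sketch of that selection, using plain dicts as hypothetical stand-ins for litellm's ModelInfo and Usage types:

# Minimal sketch of the rate selection above; plain dicts are hypothetical
# stand-ins for litellm's ModelInfo and Usage types.
from typing import Optional, Tuple


def pick_base_costs(model_info: dict, thinking_enabled: Optional[bool]) -> Tuple[float, float]:
    prompt_base_cost: float = model_info["input_cost_per_token"]
    completion_base_cost: float = model_info["output_cost_per_token"]

    # The thinking-specific output rate only applies when the request actually
    # ran with thinking enabled and the model defines such a rate.
    thinking_rate = model_info.get("output_cost_per_token_thinking")
    if thinking_enabled and thinking_rate is not None:
        completion_base_cost = thinking_rate
    return prompt_base_cost, completion_base_cost


# With the gemini-2.5-flash-preview pricing added in this commit:
rates = {
    "input_cost_per_token": 0.15e-6,
    "output_cost_per_token": 0.6e-6,
    "output_cost_per_token_thinking": 3.5e-6,
}
assert pick_base_costs(rates, thinking_enabled=True) == (0.15e-6, 3.5e-6)
assert pick_base_costs(rates, thinking_enabled=False) == (0.15e-6, 0.6e-6)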

View file

@@ -365,17 +365,14 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
if reasoning_effort == "low":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "medium":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "high":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
"includeThoughts": True,
}
else:
raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
@@ -388,9 +385,9 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
thinking_budget = thinking_param.get("budget_tokens")
params: GeminiThinkingConfig = {}
if thinking_enabled:
params["includeThoughts"] = True
if thinking_budget:
if not thinking_enabled:
params["thinkingBudget"] = 0
elif thinking_budget is not None:
params["thinkingBudget"] = thinking_budget
return params
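Taken together, the two hunks above drop the per-effort includeThoughts entries and make an Anthropic-style thinking param with budget_tokens=0 (or a non-"enabled" type) translate into an explicit thinkingBudget of 0. A simplified, hypothetical stand-alone version of that mapping, with the outputs the new tests at the end of this commit expect:

# Simplified, hypothetical stand-in for the mapping above (not litellm's
# actual method): turn an Anthropic-style thinking param into a Gemini
# thinkingConfig dict.
from typing import Optional, TypedDict


class ThinkingParam(TypedDict, total=False):
    type: str
    budget_tokens: int


def map_thinking_param(thinking: ThinkingParam) -> dict:
    thinking_enabled = thinking.get("type") == "enabled"
    thinking_budget: Optional[int] = thinking.get("budget_tokens")

    params: dict = {}
    if thinking_enabled:
        params["includeThoughts"] = True
    if not thinking_enabled:
        # For Gemini 2.5 Flash, a zero budget means "thinking disabled".
        params["thinkingBudget"] = 0
    elif thinking_budget is not None:
        params["thinkingBudget"] = thinking_budget
    return params


assert map_thinking_param({"type": "enabled", "budget_tokens": 0}) == {
    "includeThoughts": True,
    "thinkingBudget": 0,
}
assert map_thinking_param({"type": "enabled"}) == {"includeThoughts": True}
assert map_thinking_param({"type": "invalid"}) == {"thinkingBudget": 0}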
@@ -743,6 +740,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
def _calculate_usage(
self,
completion_response: GenerateContentResponseBody,
thinking_enabled: bool | None,
) -> Usage:
cached_tokens: Optional[int] = None
audio_tokens: Optional[int] = None
@@ -768,17 +766,24 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
audio_tokens=audio_tokens,
text_tokens=text_tokens,
)
completion_tokens = completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
)
if reasoning_tokens:
# Usage(...) constructor expects that completion_tokens includes the reasoning_tokens.
# However the Vertex AI usage metadata does not include reasoning tokens in candidatesTokenCount.
# Reportedly, this is different from the Gemini API.
completion_tokens += reasoning_tokens
## GET USAGE ##
usage = Usage(
prompt_tokens=completion_response["usageMetadata"].get(
"promptTokenCount", 0
),
completion_tokens=completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
),
completion_tokens=completion_tokens,
total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
prompt_tokens_details=prompt_tokens_details,
reasoning_tokens=reasoning_tokens,
thinking_enabled=thinking_enabled,
)
return usage
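Because Vertex AI's candidatesTokenCount does not include thinking tokens, the hunk above adds the reported reasoning tokens back before building Usage, so completion_tokens covers both visible output and thoughts. A worked example with hypothetical usageMetadata whose totals match the test fixtures later in this commit (the 100/1478 split and the thoughtsTokenCount field name are assumptions):

# Worked example with hypothetical usageMetadata: candidatesTokenCount is
# assumed to report only visible output, with reasoning tokens in a separate
# thoughtsTokenCount field.
usage_metadata = {
    "promptTokenCount": 17,
    "candidatesTokenCount": 100,  # visible completion tokens only
    "thoughtsTokenCount": 1478,   # reasoning tokens (hypothetical split)
    "totalTokenCount": 1595,
}

reasoning_tokens = usage_metadata.get("thoughtsTokenCount")
completion_tokens = usage_metadata.get("candidatesTokenCount", 0)
if reasoning_tokens:
    # Mirror the adjustment above: Usage expects completion_tokens to include
    # reasoning tokens, but candidatesTokenCount does not.
    completion_tokens += reasoning_tokens

assert completion_tokens == 1578
assert usage_metadata["promptTokenCount"] + completion_tokens == 1595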
@@ -910,6 +915,16 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
completion_response=completion_response,
)
thinking_enabled = None
if "gemini-2.5-flash" in model:
# Only Gemini 2.5 Flash can have its thinking disabled by setting the thinking budget to zero
thinking_budget = (
request_data.get("generationConfig", {})
.get("thinkingConfig", {})
.get("thinkingBudget")
)
thinking_enabled = thinking_budget != 0
model_response.choices = []
try:
@@ -923,7 +938,10 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
_candidates, model_response, litellm_params
)
usage = self._calculate_usage(completion_response=completion_response)
usage = self._calculate_usage(
completion_response=completion_response,
thinking_enabled=thinking_enabled,
)
setattr(model_response, "usage", usage)
## ADD METADATA TO RESPONSE ##
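The thinking_enabled flag above is inferred from the outgoing request rather than the response: thinking is on by default for gemini-2.5-flash models, so only an explicit thinkingBudget of 0 counts as disabled. A small sketch of that lookup against hypothetical request payloads:

# Sketch of the thinking_enabled detection above, with hypothetical
# request_data payloads. Absence of a thinkingConfig (or a missing budget)
# is treated as "enabled", matching the default for gemini-2.5-flash.
def is_thinking_enabled(request_data: dict) -> bool:
    thinking_budget = (
        request_data.get("generationConfig", {})
        .get("thinkingConfig", {})
        .get("thinkingBudget")
    )
    return thinking_budget != 0


assert is_thinking_enabled({}) is True  # no thinkingConfig -> default, enabled
assert is_thinking_enabled(
    {"generationConfig": {"thinkingConfig": {"thinkingBudget": 0}}}
) is False
assert is_thinking_enabled(
    {"generationConfig": {"thinkingConfig": {"thinkingBudget": 1024}}}
) is True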

View file

@@ -5413,7 +5413,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5443,7 +5443,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,

View file

@@ -138,6 +138,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
input_cost_per_token_batches: Optional[float]
output_cost_per_token_batches: Optional[float]
output_cost_per_token: Required[float]
output_cost_per_token_thinking: Optional[
float
] # only for vertex ai gemini-2.5-flash models
output_cost_per_character: Optional[float] # only for vertex ai models
output_cost_per_audio_token: Optional[float]
output_cost_per_token_above_128k_tokens: Optional[

View file

@@ -4572,6 +4572,9 @@ def _get_model_info_helper( # noqa: PLR0915
"output_cost_per_token_batches"
),
output_cost_per_token=_output_cost_per_token,
output_cost_per_token_thinking=_model_info.get(
"output_cost_per_token_thinking", None
),
output_cost_per_audio_token=_model_info.get(
"output_cost_per_audio_token", None
),
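With the passthrough above, the new rate should surface through litellm's model-info lookup. A hedged usage sketch; the exact model alias below is an assumption, not taken from this commit:

# Hedged usage sketch: read the thinking rate back via litellm's model-info
# lookup (model alias assumed).
import litellm

info = litellm.get_model_info(model="gemini/gemini-2.5-flash-preview-04-17")
print(info.get("output_cost_per_token_thinking"))  # expected: 3.5e-06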

View file

@@ -5413,7 +5413,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5443,7 +5443,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,

View file

@@ -87,6 +87,49 @@ def test_reasoning_tokens_gemini():
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
),
thinking_enabled=True,
)
model_cost_map = litellm.model_cost[model]
prompt_cost, completion_cost = generic_cost_per_token(
model=model,
usage=usage,
custom_llm_provider=custom_llm_provider,
)
assert round(prompt_cost, 10) == round(
model_cost_map["input_cost_per_token"] * usage.prompt_tokens,
10,
)
assert round(completion_cost, 10) == round(
(
model_cost_map["output_cost_per_token_thinking"]
* usage.completion_tokens
),
10,
)
def test_reasoning_disabled_tokens_gemini():
model = "gemini-2.5-flash-preview-04-17"
custom_llm_provider = "gemini"
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
usage = Usage(
completion_tokens=1578,
prompt_tokens=17,
total_tokens=1595,
completion_tokens_details=CompletionTokensDetailsWrapper(
accepted_prediction_tokens=None,
audio_tokens=None,
reasoning_tokens=None,
rejected_prediction_tokens=None,
text_tokens=1578,
),
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
),
thinking_enabled=False,
)
model_cost_map = litellm.model_cost[model]
prompt_cost, completion_cost = generic_cost_per_token(
@@ -102,11 +145,7 @@ def test_reasoning_tokens_gemini():
assert round(completion_cost, 10) == round(
(
model_cost_map["output_cost_per_token"]
* usage.completion_tokens_details.text_tokens
)
+ (
model_cost_map["output_cost_per_reasoning_token"]
* usage.completion_tokens_details.reasoning_tokens
* usage.completion_tokens
),
10,
)

View file

@@ -259,3 +259,59 @@ def test_vertex_ai_empty_content():
content, reasoning_content = v.get_assistant_content_message(parts=parts)
assert content is None
assert reasoning_content is None
def test_vertex_ai_thinking_disabled():
from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
VertexGeminiConfig,
)
from litellm.types.llms.anthropic import AnthropicThinkingParam
v = VertexGeminiConfig()
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled", budget_tokens=0),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 0
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled"),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert "thinkingBudget" not in optional_params["thinkingConfig"]
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled", budget_tokens=1024),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 1024
optional_params = v.map_openai_params(
non_default_params={
"thinking": cast(AnthropicThinkingParam, {"type": "invalid"}),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 0
optional_params = v.map_openai_params(
non_default_params={},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert "thinkingConfig" not in optional_params