anthropic prompt caching cost tracking (#5453)

* fix(utils.py): support 'drop_params' for embedding requests

Fixes https://github.com/BerriAI/litellm/issues/5444

* feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic

* feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out

* fix: fix linting errors

* test: mark flaky test
Krish Dholakia 2024-08-31 14:09:35 -07:00 committed by Ishaan Jaff
parent e6faaba56e
commit 65a9c933ad
17 changed files with 432 additions and 84 deletions
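In short: anthropic responses can now report cache_creation_input_tokens and cache_read_input_tokens in their usage block, and LiteLLM prices those tokens with the new cache_creation_input_token_cost / cache_read_input_token_cost entries added to the model cost map below. A condensed, hedged sketch of the flow the new cost test exercises (token counts are illustrative, not from a real call):

    # Sketch of the cache-aware cost flow added in this commit (mirrors the new test).
    from litellm import completion_cost
    from litellm.utils import Choices, Message, ModelResponse, Usage

    response = ModelResponse(
        choices=[
            Choices(
                finish_reason="stop",
                index=0,
                message=Message(content="Hello!", role="assistant"),
            )
        ],
        model="claude-3-5-sonnet-20240620",
        usage=Usage(
            prompt_tokens=14,
            completion_tokens=10,
            total_tokens=24,
            cache_creation_input_tokens=100,  # tokens written to the prompt cache
            cache_read_input_tokens=0,        # tokens served from the prompt cache
        ),
    )

    cost = completion_cost(
        model="anthropic/claude-3-5-sonnet-20240620", completion_response=response
    )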


@@ -841,10 +841,10 @@ ALL_LITELLM_RESPONSE_TYPES = [
 from .types.utils import ImageObject
 from .llms.custom_llm import CustomLLM
 from .llms.huggingface_restapi import HuggingfaceConfig
-from .llms.anthropic import AnthropicConfig
+from .llms.anthropic.chat import AnthropicConfig
+from .llms.anthropic.completion import AnthropicTextConfig
 from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
 from .llms.predibase import PredibaseConfig
-from .llms.anthropic_text import AnthropicTextConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere.completion import CohereConfig
 from .llms.clarifai import ClarifaiConfig


@@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
     cost_router as google_cost_router,
 )
 from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
+from litellm.llms.anthropic.cost_calculation import (
+    cost_per_token as anthropic_cost_per_token,
+)
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
+from litellm.types.utils import Usage
 from litellm.utils import (
     CallTypes,
     CostPerToken,
@@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper(
 def cost_per_token(
     model: str = "",
-    prompt_tokens: float = 0,
-    completion_tokens: float = 0,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
     response_time_ms=None,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
-    prompt_characters: float = 0,
-    completion_characters: float = 0,
+    prompt_characters: int = 0,
+    completion_characters: int = 0,
+    ### PROMPT CACHING PRICING ### - used for anthropic
+    cache_creation_input_tokens: Optional[int] = 0,
+    cache_read_input_tokens: Optional[int] = 0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -108,6 +115,16 @@ def cost_per_token(
     """
     if model is None:
         raise Exception("Invalid arg. Model cannot be none.")
+
+    ## RECONSTRUCT USAGE BLOCK ##
+    usage_block = Usage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
+    )
+
     ## CUSTOM PRICING ##
     response_cost = _cost_per_token_custom_pricing_helper(
         prompt_tokens=prompt_tokens,
@@ -137,6 +154,7 @@ def cost_per_token(
         model_with_provider = model_with_provider_and_region
     else:
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
     model_without_prefix = model
     model_parts = model.split("/")
     if len(model_parts) > 1:
@@ -162,6 +180,7 @@ def cost_per_token(
         # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
         print_verbose(f"Looking up model={model} in model_cost_map")
         if custom_llm_provider == "vertex_ai":
            cost_router = google_cost_router(
                model=model_without_prefix,
@@ -188,6 +207,8 @@ def cost_per_token(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
+        elif custom_llm_provider == "anthropic":
+            return anthropic_cost_per_token(model=model, usage=usage_block)
         elif custom_llm_provider == "gemini":
            return google_cost_per_token(
                model=model_without_prefix,
@@ -520,6 +541,8 @@ def completion_cost(
     prompt_characters = 0
     completion_tokens = 0
     completion_characters = 0
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
     if completion_response is not None and (
         isinstance(completion_response, BaseModel)
         or isinstance(completion_response, dict)
@@ -541,6 +564,13 @@ def completion_cost(
            completion_tokens = completion_response.get("usage", {}).get(
                "completion_tokens", 0
            )
+            cache_creation_input_tokens = completion_response.get("usage", {}).get(
+                "cache_creation_input_tokens", 0
+            )
+            cache_read_input_tokens = completion_response.get("usage", {}).get(
+                "cache_read_input_tokens", 0
+            )
            total_time = getattr(completion_response, "_response_ms", 0)
            verbose_logger.debug(
                f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
@@ -550,7 +580,7 @@ def completion_cost(
            )
            if hasattr(completion_response, "_hidden_params"):
                custom_llm_provider = completion_response._hidden_params.get(
-                    "custom_llm_provider", custom_llm_provider or ""
+                    "custom_llm_provider", custom_llm_provider or None
                )
                region_name = completion_response._hidden_params.get(
                    "region_name", region_name
@@ -697,6 +727,8 @@ def completion_cost(
            custom_cost_per_token=custom_cost_per_token,
            prompt_characters=prompt_characters,
            completion_characters=completion_characters,
+            cache_creation_input_tokens=cache_creation_input_tokens,
+            cache_read_input_tokens=cache_read_input_tokens,
            call_type=call_type,
        )
        _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
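For callers working at the token level rather than with a full response object, the updated cost_per_token() signature above accepts the two cache counters directly. A minimal sketch, assuming the model resolves in the local cost map and that cost_per_token is exposed at the package top level as elsewhere in litellm (values are illustrative):

    # Sketch: cache counters are folded into the reconstructed Usage block and
    # routed to the anthropic-specific cost helper.
    from litellm import cost_per_token

    prompt_usd, completion_usd = cost_per_token(
        model="claude-3-5-sonnet-20240620",
        custom_llm_provider="anthropic",
        prompt_tokens=14,
        completion_tokens=10,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )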


@@ -1,3 +1,7 @@
+"""
+Calling + translation logic for anthropic's `/v1/messages` endpoint
+"""
+
 import copy
 import json
 import os
@@ -70,8 +74,8 @@ from litellm.types.llms.openai import (
 from litellm.types.utils import Choices, GenericStreamingChunk
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
 
-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory
 
 
 class AnthropicConstants(Enum):
@@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM):
            )
        except Exception as e:
            verbose_logger.exception(
-                "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
+                "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
                    str(e), messages
                )
            )


@@ -1,3 +1,7 @@
+"""
+Translation logic for anthropic's `/v1/complete` endpoint
+"""
+
 import json
 import os
 import time
@@ -12,8 +16,8 @@ import litellm
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
 
-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory
 
 
 class AnthropicConstants(Enum):


@@ -0,0 +1,42 @@
+"""
+Helper util for handling anthropic-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Tuple
+
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="anthropic")
+
+    ## CALCULATE INPUT COST
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+    if model_info.get("cache_creation_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_creation_input_tokens  # type: ignore
+            * model_info["cache_creation_input_token_cost"]
+        )
+    if model_info.get("cache_read_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"]  # type: ignore
+        )
+
+    ## CALCULATE OUTPUT COST
+    completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
+
+    return prompt_cost, completion_cost
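Plugging in the claude-3-5-sonnet prices added to the cost map in this commit, the helper's formula works out as follows (token counts are illustrative):

    # Worked example of the cache-aware cost, using the per-token USD prices
    # added for claude-3-5-sonnet in this commit.
    input_cost_per_token = 0.000003
    cache_creation_input_token_cost = 0.00000375  # 1.25x the base input price
    cache_read_input_token_cost = 0.0000003       # 0.1x the base input price
    output_cost_per_token = 0.000015

    prompt_tokens = 14
    cache_creation_input_tokens = 100
    cache_read_input_tokens = 0
    completion_tokens = 10

    prompt_cost = (
        prompt_tokens * input_cost_per_token
        + cache_creation_input_tokens * cache_creation_input_token_cost
        + cache_read_input_tokens * cache_read_input_token_cost
    )  # 0.000042 + 0.000375 + 0.0 = 0.000417 USD
    completion_cost = completion_tokens * output_cost_per_token  # 0.00015 USD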


@@ -1,11 +1,14 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
+from typing import Any, Optional, Union
+
+import httpx
+import requests
+
 import litellm
-import httpx, requests
-from typing import Optional, Union
-from litellm.litellm_core_utils.litellm_logging import Logging
 
 
 class BaseLLM:
     _client_session: Optional[httpx.Client] = None
 
     def process_response(
@@ -14,7 +17,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.ModelResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],
@@ -33,7 +36,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.TextCompletionResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],


@@ -267,18 +267,19 @@ def completion(
    ):
        try:
            import vertexai
-            from anthropic import AnthropicVertex
-            from litellm.llms.anthropic import AnthropicChatCompletion
-            from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
-                VertexLLM,
-            )
        except:
            raise VertexAIError(
                status_code=400,
                message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""",
            )
 
+        from anthropic import AnthropicVertex
+        from litellm.llms.anthropic.chat import AnthropicChatCompletion
+        from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+            VertexLLM,
+        )
+
    if not (
        hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
    ):


@@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache
 from .llms import (
     ai21,
     aleph_alpha,
-    anthropic_text,
     baseten,
     bedrock,
     clarifai,
     cloudflare,
-    gemini,
-    huggingface_restapi,
     maritalk,
     nlp_cloud,
     ollama,
@@ -93,13 +90,10 @@ from .llms import (
     palm,
     petals,
     replicate,
-    together_ai,
-    triton,
     vllm,
-    watsonx,
 )
-from .llms.anthropic import AnthropicChatCompletion
-from .llms.anthropic_text import AnthropicTextCompletion
+from .llms.anthropic.chat import AnthropicChatCompletion
+from .llms.anthropic.completion import AnthropicTextCompletion
 from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params
 from .llms.azure_text import AzureTextCompletion
 from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM


@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,


@@ -1,4 +1,4 @@
 model_list:
-  - model_name: "gemini/*"
+  - model_name: "gpt-3.5-turbo"
     litellm_params:
-      model: "gemini/*"
+      model: "gpt-3.5-turbo"


@@ -10,7 +10,7 @@ from dotenv import load_dotenv
 import litellm.types
 import litellm.types.utils
-from litellm.llms.anthropic import ModelResponseIterator
+from litellm.llms.anthropic.chat import ModelResponseIterator
 
 load_dotenv()
 import io
@@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream):
 anthropic_chunk_list = [
-    {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}},
-    {"type": "content_block_delta", "index": 0,
-     "delta": {"type": "text_delta", "text": " your question about the weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}},
+    {
+        "type": "content_block_start",
+        "index": 0,
+        "content_block": {"type": "text", "text": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": "To"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " answer"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " your question about the weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " in Boston and Los"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " Angeles today, I'll"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " need to"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " use"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " the"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " get_current_weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " function"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " both"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " cities"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": ". Let"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " me fetch"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " that"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " information"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " you."},
+    },
     {"type": "content_block_stop", "index": 0},
-    {"type": "content_block_start", "index": 1,
-     "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 1,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_12345",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": '{"locat'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'},
+    },
     {"type": "content_block_stop", "index": 1},
-    {"type": "content_block_start", "index": 2,
-     "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 2,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_023423423",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": '{"l'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "oca"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "tio"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "s Angel"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'},
+    },
     {"type": "content_block_stop", "index": 2},
-    {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None},
-     "usage": {"output_tokens": 137}},
-    {"type": "message_stop"}
+    {
+        "type": "message_delta",
+        "delta": {"stop_reason": "tool_use", "stop_sequence": None},
+        "usage": {"output_tokens": 137},
+    },
+    {"type": "message_stop"},
 ]
@@ -211,12 +353,12 @@ def test_anthropic_tool_streaming():
     correct_tool_index = -1
     for chunk in anthropic_chunk_list:
         parsed_chunk = response_iter.chunk_parser(chunk)
-        if tool_use := parsed_chunk.get('tool_use'):
+        if tool_use := parsed_chunk.get("tool_use"):
             # We only increment when a new block starts
-            if tool_use.get('id') is not None:
+            if tool_use.get("id") is not None:
                 correct_tool_index += 1
-            assert tool_use['index'] == correct_tool_index
+            assert tool_use["index"] == correct_tool_index
 
 
 @pytest.mark.asyncio
@@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation():
     print(translated_params["messages"])
     assert len(translated_params["messages"]) > 0
     assert translated_params["messages"][0]["role"] == "user"


@@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name():
        print(f"mock_client.call_args: {mock_client.call_args.kwargs}")
        assert "azure/gpt-4" == mock_client.call_args.kwargs["model"]
+
+
+def test_completion_cost_anthropic_prompt_caching():
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+
+    model = "anthropic/claude-3-5-sonnet-20240620"
+
+    ## WRITE TO CACHE ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=100,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    ## READ FROM CACHE ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=100,
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+
+    assert cost_1 > cost_2


@@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response):
 
 @pytest.mark.asyncio
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_update_cache(
     dynamic_rate_limit_handler, mock_response, user_api_key_auth
 ):


@@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings():
     assert len(optional_params) == 0
 
 
+def test_google_ai_studio_optional_params_embeddings():
+    optional_params = get_optional_params_embeddings(
+        user="John",
+        encoding_format=None,
+        custom_llm_provider="gemini",
+        drop_params=True,
+    )
+
+    assert len(optional_params) == 0
+
+
 def test_openai_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(


@@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False):
     max_input_tokens: Required[Optional[int]]
     max_output_tokens: Required[Optional[int]]
     input_cost_per_token: Required[float]
+    cache_creation_input_token_cost: Optional[float]
+    cache_read_input_token_cost: Optional[float]
     input_cost_per_character: Optional[float]  # only for vertex ai models
     input_cost_per_token_above_128k_tokens: Optional[float]  # only for vertex ai models
     input_cost_per_character_above_128k_tokens: Optional[
@@ -454,6 +456,13 @@ class Choices(OpenAIObject):
 
 class Usage(CompletionUsage):
+    _cache_creation_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+    _cache_read_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+
     def __init__(
         self,
         prompt_tokens: Optional[int] = None,
@@ -466,9 +475,18 @@ class Usage(CompletionUsage):
             "completion_tokens": completion_tokens or 0,
             "total_tokens": total_tokens or 0,
         }
         super().__init__(**data)
+
+        if "cache_creation_input_tokens" in params and isinstance(
+            params["cache_creation_input_tokens"], int
+        ):
+            self._cache_creation_input_tokens = params["cache_creation_input_tokens"]
+        if "cache_read_input_tokens" in params and isinstance(
+            params["cache_read_input_tokens"], int
+        ):
+            self._cache_read_input_tokens = params["cache_read_input_tokens"]
         for k, v in params.items():
             setattr(self, k, v)
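Because OpenAI's CompletionUsage has no equivalent fields yet, the anthropic counters ride along on Usage as private attributes. A small sketch of how they surface (values are illustrative):

    # Sketch: extra anthropic usage keys are captured as private attributes on
    # the Usage object (and also set as plain attributes by the setattr loop).
    from litellm.types.utils import Usage

    usage = Usage(
        prompt_tokens=14,
        completion_tokens=10,
        total_tokens=24,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )

    assert usage._cache_creation_input_tokens == 100
    assert usage._cache_read_input_tokens == 0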


@@ -2550,6 +2550,7 @@ def get_optional_params_embeddings(
     encoding_format=None,
     dimensions=None,
     custom_llm_provider="",
+    drop_params: Optional[bool] = None,
     additional_drop_params: Optional[bool] = None,
     **kwargs,
 ):
@@ -2560,6 +2561,7 @@ def get_optional_params_embeddings(
     for k, v in special_params.items():
         passed_params[k] = v
 
+    drop_params = passed_params.pop("drop_params", None)
     additional_drop_params = passed_params.pop("additional_drop_params", None)
 
     default_params = {"user": None, "encoding_format": None, "dimensions": None}
@@ -2571,11 +2573,16 @@ def get_optional_params_embeddings(
     for k in non_default_params.keys():
         if k not in supported_params:
             unsupported_params[k] = non_default_params[k]
-    if unsupported_params and not litellm.drop_params:
-        raise UnsupportedParamsError(
-            status_code=500,
-            message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
-        )
+    if unsupported_params:
+        if litellm.drop_params is True or (
+            drop_params is not None and drop_params is True
+        ):
+            pass
+        else:
+            raise UnsupportedParamsError(
+                status_code=500,
+                message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
+            )
 
     non_default_params = _get_non_default_params(
         passed_params=passed_params,
@@ -2680,7 +2687,9 @@ def get_optional_params_embeddings(
         and custom_llm_provider not in litellm.openai_compatible_providers
     ):
         if len(non_default_params.keys()) > 0:
-            if litellm.drop_params is True:  # drop the unsupported non-default values
+            if (
+                litellm.drop_params is True or drop_params is True
+            ):  # drop the unsupported non-default values
                 keys = list(non_default_params.keys())
                 for k in keys:
                     non_default_params.pop(k, None)
@@ -5335,6 +5344,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
            max_input_tokens=_model_info.get("max_input_tokens", None),
            max_output_tokens=_model_info.get("max_output_tokens", None),
            input_cost_per_token=_input_cost_per_token,
+            cache_creation_input_token_cost=_model_info.get(
+                "cache_creation_input_token_cost", None
+            ),
+            cache_read_input_token_cost=_model_info.get(
+                "cache_read_input_token_cost", None
+            ),
            input_cost_per_character=_model_info.get(
                "input_cost_per_character", None
            ),
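On the embeddings side, the request-level drop_params flag is now honored when filtering optional params, so unsupported fields are dropped instead of raising UnsupportedParamsError. A hedged sketch under those assumptions (the model name is illustrative; drop_params can also be set globally via litellm.drop_params = True, or on the proxy with litellm_settings: drop_params: true, as the error message above suggests):

    # Sketch: per-request drop_params for an embedding call; `user` is not a
    # supported param for this provider, so it should be dropped rather than
    # raising UnsupportedParamsError. Model name is illustrative.
    import litellm

    response = litellm.embedding(
        model="gemini/text-embedding-004",
        input=["hello world"],
        user="John",
        drop_params=True,
    )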


@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,