forked from phoenix/litellm-mirror
anthropic prompt caching cost tracking (#5453)
* fix(utils.py): support 'drop_params' for embedding requests. Fixes https://github.com/BerriAI/litellm/issues/5444
* feat(anthropic/cost_calculation.py): support calculating cost for prompt caching on anthropic
* feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out
* fix: fix linting errors
* test: mark flaky test
This commit is contained in: parent e6faaba56e, commit 65a9c933ad

17 changed files with 432 additions and 84 deletions
@@ -841,10 +841,10 @@ ALL_LITELLM_RESPONSE_TYPES = [
 from .types.utils import ImageObject
 from .llms.custom_llm import CustomLLM
 from .llms.huggingface_restapi import HuggingfaceConfig
-from .llms.anthropic import AnthropicConfig
+from .llms.anthropic.chat import AnthropicConfig
+from .llms.anthropic.completion import AnthropicTextConfig
 from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
 from .llms.predibase import PredibaseConfig
-from .llms.anthropic_text import AnthropicTextConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere.completion import CohereConfig
 from .llms.clarifai import ClarifaiConfig

@@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
     cost_router as google_cost_router,
 )
 from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
+from litellm.llms.anthropic.cost_calculation import (
+    cost_per_token as anthropic_cost_per_token,
+)
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
+from litellm.types.utils import Usage
 from litellm.utils import (
     CallTypes,
     CostPerToken,
@@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper(

 def cost_per_token(
     model: str = "",
-    prompt_tokens: float = 0,
-    completion_tokens: float = 0,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
     response_time_ms=None,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
-    prompt_characters: float = 0,
-    completion_characters: float = 0,
+    prompt_characters: int = 0,
+    completion_characters: int = 0,
+    ### PROMPT CACHING PRICING ### - used for anthropic
+    cache_creation_input_tokens: Optional[int] = 0,
+    cache_read_input_tokens: Optional[int] = 0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -108,6 +115,16 @@ def cost_per_token(
     """
     if model is None:
         raise Exception("Invalid arg. Model cannot be none.")
+
+    ## RECONSTRUCT USAGE BLOCK ##
+    usage_block = Usage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
+    )
+
     ## CUSTOM PRICING ##
     response_cost = _cost_per_token_custom_pricing_helper(
         prompt_tokens=prompt_tokens,
@@ -137,6 +154,7 @@ def cost_per_token(
         model_with_provider = model_with_provider_and_region
     else:
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
+
     model_without_prefix = model
     model_parts = model.split("/")
     if len(model_parts) > 1:
@@ -162,6 +180,7 @@ def cost_per_token(

     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
+
     if custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
@@ -188,6 +207,8 @@ def cost_per_token(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
         )
+    elif custom_llm_provider == "anthropic":
+        return anthropic_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "gemini":
         return google_cost_per_token(
             model=model_without_prefix,
@@ -520,6 +541,8 @@ def completion_cost(
     prompt_characters = 0
     completion_tokens = 0
     completion_characters = 0
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
     if completion_response is not None and (
         isinstance(completion_response, BaseModel)
         or isinstance(completion_response, dict)
@@ -541,6 +564,13 @@ def completion_cost(
             completion_tokens = completion_response.get("usage", {}).get(
                 "completion_tokens", 0
             )
+            cache_creation_input_tokens = completion_response.get("usage", {}).get(
+                "cache_creation_input_tokens", 0
+            )
+            cache_read_input_tokens = completion_response.get("usage", {}).get(
+                "cache_read_input_tokens", 0
+            )
+
             total_time = getattr(completion_response, "_response_ms", 0)
             verbose_logger.debug(
                 f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
@@ -550,7 +580,7 @@ def completion_cost(
             )
             if hasattr(completion_response, "_hidden_params"):
                 custom_llm_provider = completion_response._hidden_params.get(
-                    "custom_llm_provider", custom_llm_provider or ""
+                    "custom_llm_provider", custom_llm_provider or None
                 )
                 region_name = completion_response._hidden_params.get(
                     "region_name", region_name
@@ -697,6 +727,8 @@ def completion_cost(
         custom_cost_per_token=custom_cost_per_token,
         prompt_characters=prompt_characters,
         completion_characters=completion_characters,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
         call_type=call_type,
     )
     _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar

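A minimal sketch of driving the reworked `cost_per_token` signature above directly. This is not part of the diff: the token counts are invented for illustration, and it assumes a litellm build containing this commit.

    import litellm

    # cache_creation / cache_read are the new prompt-caching counters
    # threaded through cost_per_token in the hunk above.
    prompt_cost, completion_cost = litellm.cost_per_token(
        model="claude-3-5-sonnet-20240620",
        custom_llm_provider="anthropic",
        prompt_tokens=14,
        completion_tokens=10,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )
    print(prompt_cost, completion_cost)
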
@@ -1,3 +1,7 @@
+"""
+Calling + translation logic for anthropic's `/v1/messages` endpoint
+"""
+
 import copy
 import json
 import os
@@ -70,8 +74,8 @@ from litellm.types.llms.openai import (
 from litellm.types.utils import Choices, GenericStreamingChunk
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory


 class AnthropicConstants(Enum):
@@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM):
             )
         except Exception as e:
             verbose_logger.exception(
-                "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
+                "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
                     str(e), messages
                 )
             )

@@ -1,3 +1,7 @@
+"""
+Translation logic for anthropic's `/v1/complete` endpoint
+"""
+
 import json
 import os
 import time
@@ -12,8 +16,8 @@ import litellm
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory


 class AnthropicConstants(Enum):

litellm/llms/anthropic/cost_calculation.py (new file, 42 lines)

@@ -0,0 +1,42 @@
+"""
+Helper util for handling anthropic-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Tuple
+
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="anthropic")
+
+    ## CALCULATE INPUT COST
+
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+    if model_info.get("cache_creation_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_creation_input_tokens  # type: ignore
+            * model_info["cache_creation_input_token_cost"]
+        )
+    if model_info.get("cache_read_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"]  # type: ignore
+        )
+
+    ## CALCULATE OUTPUT COST
+    completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
+
+    return prompt_cost, completion_cost

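A rough sketch of exercising the new helper on its own, separate from the diff itself. The Usage block mirrors the one `cost_per_token` builds in cost_calculator above; the token counts are invented for illustration.

    from litellm.llms.anthropic.cost_calculation import cost_per_token
    from litellm.types.utils import Usage

    # 14 uncached prompt tokens, 100 tokens written to the cache, nothing read back.
    usage = Usage(
        prompt_tokens=14,
        completion_tokens=10,
        total_tokens=24,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )

    prompt_cost, completion_cost = cost_per_token(
        model="claude-3-5-sonnet-20240620", usage=usage
    )
    # prompt_cost = 14 * input_cost_per_token + 100 * cache_creation_input_token_cost
    # completion_cost = 10 * output_cost_per_token
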
@@ -1,11 +1,14 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
+from typing import Any, Optional, Union
+
+import httpx
+import requests
+
 import litellm
-import httpx, requests
-from typing import Optional, Union
-from litellm.litellm_core_utils.litellm_logging import Logging
+

 class BaseLLM:

     _client_session: Optional[httpx.Client] = None

     def process_response(
@@ -14,7 +17,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.ModelResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],
@@ -33,7 +36,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.TextCompletionResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],

@@ -267,18 +267,19 @@ def completion(
 ):
     try:
         import vertexai
-        from anthropic import AnthropicVertex
-
-        from litellm.llms.anthropic import AnthropicChatCompletion
-        from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
-            VertexLLM,
-        )
     except:
         raise VertexAIError(
             status_code=400,
             message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""",
         )

+    from anthropic import AnthropicVertex
+
+    from litellm.llms.anthropic.chat import AnthropicChatCompletion
+    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexLLM,
+    )
+
     if not (
         hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
     ):

@@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache
 from .llms import (
     ai21,
     aleph_alpha,
-    anthropic_text,
     baseten,
     bedrock,
     clarifai,
     cloudflare,
     gemini,
     huggingface_restapi,
     maritalk,
     nlp_cloud,
     ollama,
@@ -93,13 +90,10 @@ from .llms import (
     palm,
     petals,
     replicate,
     together_ai,
     triton,
     vllm,
     watsonx,
 )
-from .llms.anthropic import AnthropicChatCompletion
-from .llms.anthropic_text import AnthropicTextCompletion
+from .llms.anthropic.chat import AnthropicChatCompletion
+from .llms.anthropic.completion import AnthropicTextCompletion
 from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params
 from .llms.azure_text import AzureTextCompletion
 from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM

@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,

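Back-of-the-envelope check (not output from the code) using the first set of rates added above, for a request with 1,000 regular prompt tokens, 2,000 cache-creation tokens, and 500 cache-read tokens:

    input_cost = (
        1_000 * 0.00000025   # input_cost_per_token
        + 2_000 * 0.0000003  # cache_creation_input_token_cost
        + 500 * 0.00000003   # cache_read_input_token_cost
    )
    assert round(input_cost, 9) == 0.000865
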
@@ -1,4 +1,4 @@
 model_list:
-  - model_name: "gemini/*"
+  - model_name: "gpt-3.5-turbo"
     litellm_params:
-      model: "gemini/*"
+      model: "gpt-3.5-turbo"

@@ -10,7 +10,7 @@ from dotenv import load_dotenv

 import litellm.types
 import litellm.types.utils
-from litellm.llms.anthropic import ModelResponseIterator
+from litellm.llms.anthropic.chat import ModelResponseIterator

 load_dotenv()
 import io
@@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream):


 anthropic_chunk_list = [
-    {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}},
-    {"type": "content_block_delta", "index": 0,
-     "delta": {"type": "text_delta", "text": " your question about the weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}},
+    {
+        "type": "content_block_start",
+        "index": 0,
+        "content_block": {"type": "text", "text": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": "To"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " answer"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " your question about the weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " in Boston and Los"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " Angeles today, I'll"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " need to"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " use"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " the"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " get_current_weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " function"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " both"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " cities"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": ". Let"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " me fetch"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " that"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " information"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " you."},
+    },
     {"type": "content_block_stop", "index": 0},
-    {"type": "content_block_start", "index": 1,
-     "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 1,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_12345",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": '{"locat'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'},
+    },
     {"type": "content_block_stop", "index": 1},
-    {"type": "content_block_start", "index": 2,
-     "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 2,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_023423423",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": '{"l'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "oca"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "tio"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "s Angel"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'},
+    },
     {"type": "content_block_stop", "index": 2},
-    {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None},
-     "usage": {"output_tokens": 137}},
-    {"type": "message_stop"}
+    {
+        "type": "message_delta",
+        "delta": {"stop_reason": "tool_use", "stop_sequence": None},
+        "usage": {"output_tokens": 137},
+    },
+    {"type": "message_stop"},
 ]


@@ -211,12 +353,12 @@ def test_anthropic_tool_streaming():
     correct_tool_index = -1
     for chunk in anthropic_chunk_list:
         parsed_chunk = response_iter.chunk_parser(chunk)
-        if tool_use := parsed_chunk.get('tool_use'):
+        if tool_use := parsed_chunk.get("tool_use"):

             # We only increment when a new block starts
-            if tool_use.get('id') is not None:
+            if tool_use.get("id") is not None:
                 correct_tool_index += 1
-            assert tool_use['index'] == correct_tool_index
+            assert tool_use["index"] == correct_tool_index


 @pytest.mark.asyncio
@@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation():
     print(translated_params["messages"])

     assert len(translated_params["messages"]) > 0
-    assert translated_params["messages"][0]["role"] == "user"
\ No newline at end of file
+    assert translated_params["messages"][0]["role"] == "user"

@@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name():

     print(f"mock_client.call_args: {mock_client.call_args.kwargs}")
     assert "azure/gpt-4" == mock_client.call_args.kwargs["model"]
+
+
+def test_completion_cost_anthropic_prompt_caching():
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+
+    model = "anthropic/claude-3-5-sonnet-20240620"
+
+    ## WRITE TO CACHE ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=100,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    ## READ FROM CACHE ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=100,
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+
+    assert cost_1 > cost_2

@@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response):


 @pytest.mark.asyncio
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_update_cache(
     dynamic_rate_limit_handler, mock_response, user_api_key_auth
 ):

@@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings():
     assert len(optional_params) == 0


+def test_google_ai_studio_optional_params_embeddings():
+    optional_params = get_optional_params_embeddings(
+        user="John",
+        encoding_format=None,
+        custom_llm_provider="gemini",
+        drop_params=True,
+    )
+    assert len(optional_params) == 0
+
+
 def test_openai_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(

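The same drop_params behaviour from the caller's side, as a sketch rather than part of the diff. The model name is a hypothetical choice; the point is that params gemini embeddings do not support (such as `user`) are now dropped per-request instead of raising.

    import litellm

    response = litellm.embedding(
        model="gemini/text-embedding-004",  # hypothetical model choice
        input=["hello world"],
        user="John",          # unsupported for this provider
        drop_params=True,     # dropped instead of raising UnsupportedParamsError
    )
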
@@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False):
     max_input_tokens: Required[Optional[int]]
     max_output_tokens: Required[Optional[int]]
     input_cost_per_token: Required[float]
+    cache_creation_input_token_cost: Optional[float]
+    cache_read_input_token_cost: Optional[float]
     input_cost_per_character: Optional[float]  # only for vertex ai models
     input_cost_per_token_above_128k_tokens: Optional[float]  # only for vertex ai models
     input_cost_per_character_above_128k_tokens: Optional[
@@ -454,6 +456,13 @@ class Choices(OpenAIObject):


 class Usage(CompletionUsage):
+    _cache_creation_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+    _cache_read_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+
     def __init__(
         self,
         prompt_tokens: Optional[int] = None,
@@ -466,9 +475,18 @@ class Usage(CompletionUsage):
             "completion_tokens": completion_tokens or 0,
             "total_tokens": total_tokens or 0,
         }

         super().__init__(**data)

+        if "cache_creation_input_tokens" in params and isinstance(
+            params["cache_creation_input_tokens"], int
+        ):
+            self._cache_creation_input_tokens = params["cache_creation_input_tokens"]
+
+        if "cache_read_input_tokens" in params and isinstance(
+            params["cache_read_input_tokens"], int
+        ):
+            self._cache_read_input_tokens = params["cache_read_input_tokens"]
+
         for k, v in params.items():
             setattr(self, k, v)

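A small sketch of how the new hidden counters behave on the Usage object above (not part of the diff; attribute names are taken from the class as shown and are private, so they may change once OpenAI ships an equivalent field):

    from litellm.types.utils import Usage

    usage = Usage(
        prompt_tokens=14,
        completion_tokens=10,
        total_tokens=24,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )

    # Stored as private attrs so they don't collide with OpenAI's usage schema.
    assert usage._cache_creation_input_tokens == 100
    assert usage._cache_read_input_tokens == 0
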
@@ -2550,6 +2550,7 @@ def get_optional_params_embeddings(
     encoding_format=None,
     dimensions=None,
     custom_llm_provider="",
+    drop_params: Optional[bool] = None,
     additional_drop_params: Optional[bool] = None,
     **kwargs,
 ):
@@ -2560,6 +2561,7 @@ def get_optional_params_embeddings(
     for k, v in special_params.items():
         passed_params[k] = v

+    drop_params = passed_params.pop("drop_params", None)
     additional_drop_params = passed_params.pop("additional_drop_params", None)

     default_params = {"user": None, "encoding_format": None, "dimensions": None}
@@ -2571,11 +2573,16 @@ def get_optional_params_embeddings(
     for k in non_default_params.keys():
         if k not in supported_params:
             unsupported_params[k] = non_default_params[k]
-    if unsupported_params and not litellm.drop_params:
-        raise UnsupportedParamsError(
-            status_code=500,
-            message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
-        )
+    if unsupported_params:
+        if litellm.drop_params is True or (
+            drop_params is not None and drop_params is True
+        ):
+            pass
+        else:
+            raise UnsupportedParamsError(
+                status_code=500,
+                message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
+            )

     non_default_params = _get_non_default_params(
         passed_params=passed_params,
@@ -2680,7 +2687,9 @@ def get_optional_params_embeddings(
         and custom_llm_provider not in litellm.openai_compatible_providers
     ):
         if len(non_default_params.keys()) > 0:
-            if litellm.drop_params is True:  # drop the unsupported non-default values
+            if (
+                litellm.drop_params is True or drop_params is True
+            ):  # drop the unsupported non-default values
                 keys = list(non_default_params.keys())
                 for k in keys:
                     non_default_params.pop(k, None)
@@ -5335,6 +5344,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             max_input_tokens=_model_info.get("max_input_tokens", None),
             max_output_tokens=_model_info.get("max_output_tokens", None),
             input_cost_per_token=_input_cost_per_token,
+            cache_creation_input_token_cost=_model_info.get(
+                "cache_creation_input_token_cost", None
+            ),
+            cache_read_input_token_cost=_model_info.get(
+                "cache_read_input_token_cost", None
+            ),
             input_cost_per_character=_model_info.get(
                 "input_cost_per_character", None
             ),

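get_model_info now surfaces the two cache prices; a quick sketch (not part of the diff), assuming the local model-cost map contains the anthropic entries added in this commit:

    from litellm import get_model_info

    info = get_model_info(
        model="claude-3-5-sonnet-20240620", custom_llm_provider="anthropic"
    )
    print(info["input_cost_per_token"])
    print(info.get("cache_creation_input_token_cost"))  # per the rates added above
    print(info.get("cache_read_input_token_cost"))      # per the rates added above
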
@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,