From 6ab601432b2d6113725544f08702ff56005fc162 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 10:05:23 -0700 Subject: [PATCH 1/4] feat prometheus add metric for failure / model --- litellm/integrations/prometheus.py | 19 ++++++++++++++++++- litellm/proxy/proxy_config.yaml | 15 +++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 642a776b6..ed5035074 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -138,6 +138,13 @@ class PrometheusLogger(CustomLogger): labelnames=["hashed_api_key", "api_key_alias", "model"], ) + # New metric for tracking error codes and models + self.litellm_error_code_metric = Counter( + "litellm_error_code_metric", + "Total number of errors by error code and model", + labelnames=["error_code", "model"], + ) + # Litellm-Enterprise Metrics if premium_user is True: @@ -378,7 +385,7 @@ class PrometheusLogger(CustomLogger): from litellm.proxy.proxy_server import premium_user verbose_logger.debug( - f"prometheus Logging - Enters success logging function for kwargs {kwargs}" + f"prometheus Logging - Enters failure logging function for kwargs {kwargs}" ) # unpack kwargs @@ -409,6 +416,16 @@ class PrometheusLogger(CustomLogger): user_id, ).inc() self.set_llm_deployment_failure_metrics(kwargs) + + _exception = kwargs.get("exception", None) + error_code = "unknown" + if _exception is not None and hasattr(_exception, "status_code"): + error_code = _exception.status_code + + # Increment the new error code metric + self.litellm_error_code_metric.labels( + error_code=error_code, model=model + ).inc() except Exception as e: verbose_logger.exception( "prometheus Layer Error(): Exception occured - {}".format(str(e)) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index c5f736bac..e97833421 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -3,21 +3,12 @@ model_list: litellm_params: model: openai/fake api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["teamA"] # 👈 Key Change model_info: id: "team-a-model" # used for identifying model in response headers - - model_name: fake-openai-endpoint - litellm_params: - model: openai/fake - api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["teamB"] # 👈 Key Change - model_info: - id: "team-b-model" # used for identifying model in response headers -router_settings: - enable_tag_filtering: True # 👈 Key Change +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] general_settings: master_key: sk-1234 \ No newline at end of file From e6faaba56e83585995dd0b6ce8ac89c29d1f12c7 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 14:46:41 -0700 Subject: [PATCH 2/4] docs add litellm_error_code_metric_total --- docs/my-website/docs/proxy/prometheus.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 10e6456c2..6ccf0e44e 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -58,6 +58,15 @@ http://localhost:4000/metrics ## 📈 Metrics Tracked +### Error Metrics + +| Metric Name | Description | +|----------------------|--------------------------------------| +| `litellm_error_code_metric_total` | Total number of errors by error code and model | + 
+This metric provides a count of errors encountered, categorized by error code and model. For example: + + ### Proxy Requests / Spend Metrics @@ -66,7 +75,12 @@ http://localhost:4000/metrics | `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` | | `litellm_spend_metric` | Total Spend, per `"user", "key", "model", "team", "end-user"` | | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` | + +### Error Monitoring Metrics + +| Metric Name | Description | | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` | +| `litellm_error_code_metric_total` | Total number of errors by error code and model | ### Request Latency Metrics From 65a9c933adaae9daca0a59e5bd1ac42dd5be2a41 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Sat, 31 Aug 2024 14:09:35 -0700 Subject: [PATCH 3/4] anthropic prompt caching cost tracking (#5453) * fix(utils.py): support 'drop_params' for embedding requests Fixes https://github.com/BerriAI/litellm/issues/5444 * feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic * feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out * fix: fix linting errors * test: mark flaky test --- litellm/__init__.py | 4 +- litellm/cost_calculator.py | 42 +++- .../llms/{anthropic.py => anthropic/chat.py} | 10 +- .../completion.py} | 8 +- litellm/llms/anthropic/cost_calculation.py | 42 ++++ litellm/llms/base.py | 13 +- .../vertex_ai_anthropic.py | 13 +- litellm/main.py | 10 +- ...odel_prices_and_context_window_backup.json | 6 + litellm/proxy/_new_secret_config.yaml | 4 +- litellm/tests/test_anthropic_completion.py | 230 ++++++++++++++---- litellm/tests/test_completion_cost.py | 70 ++++++ .../tests/test_dynamic_rate_limit_handler.py | 1 + litellm/tests/test_optional_params.py | 10 + litellm/types/utils.py | 20 +- litellm/utils.py | 27 +- model_prices_and_context_window.json | 6 + 17 files changed, 432 insertions(+), 84 deletions(-) rename litellm/llms/{anthropic.py => anthropic/chat.py} (99%) rename litellm/llms/{anthropic_text.py => anthropic/completion.py} (98%) create mode 100644 litellm/llms/anthropic/cost_calculation.py diff --git a/litellm/__init__.py b/litellm/__init__.py index 1c3b8434f..0436e039c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -841,10 +841,10 @@ ALL_LITELLM_RESPONSE_TYPES = [ from .types.utils import ImageObject from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig -from .llms.anthropic import AnthropicConfig +from .llms.anthropic.chat import AnthropicConfig +from .llms.anthropic.completion import AnthropicTextConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig -from .llms.anthropic_text import AnthropicTextConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig from .llms.clarifai import ClarifaiConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 3c025055e..a0645c19a 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import ( cost_router as google_cost_router, ) from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character +from litellm.llms.anthropic.cost_calculation import ( + cost_per_token as 
anthropic_cost_per_token, +) from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS +from litellm.types.utils import Usage from litellm.utils import ( CallTypes, CostPerToken, @@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper( def cost_per_token( model: str = "", - prompt_tokens: float = 0, - completion_tokens: float = 0, + prompt_tokens: int = 0, + completion_tokens: int = 0, response_time_ms=None, custom_llm_provider: Optional[str] = None, region_name=None, ### CHARACTER PRICING ### - prompt_characters: float = 0, - completion_characters: float = 0, + prompt_characters: int = 0, + completion_characters: int = 0, + ### PROMPT CACHING PRICING ### - used for anthropic + cache_creation_input_tokens: Optional[int] = 0, + cache_read_input_tokens: Optional[int] = 0, ### CUSTOM PRICING ### custom_cost_per_token: Optional[CostPerToken] = None, custom_cost_per_second: Optional[float] = None, @@ -108,6 +115,16 @@ def cost_per_token( """ if model is None: raise Exception("Invalid arg. Model cannot be none.") + + ## RECONSTRUCT USAGE BLOCK ## + usage_block = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + ## CUSTOM PRICING ## response_cost = _cost_per_token_custom_pricing_helper( prompt_tokens=prompt_tokens, @@ -137,6 +154,7 @@ def cost_per_token( model_with_provider = model_with_provider_and_region else: _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) + model_without_prefix = model model_parts = model.split("/") if len(model_parts) > 1: @@ -162,6 +180,7 @@ def cost_per_token( # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models print_verbose(f"Looking up model={model} in model_cost_map") + if custom_llm_provider == "vertex_ai": cost_router = google_cost_router( model=model_without_prefix, @@ -188,6 +207,8 @@ def cost_per_token( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, ) + elif custom_llm_provider == "anthropic": + return anthropic_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, @@ -520,6 +541,8 @@ def completion_cost( prompt_characters = 0 completion_tokens = 0 completion_characters = 0 + cache_creation_input_tokens: Optional[int] = None + cache_read_input_tokens: Optional[int] = None if completion_response is not None and ( isinstance(completion_response, BaseModel) or isinstance(completion_response, dict) @@ -541,6 +564,13 @@ def completion_cost( completion_tokens = completion_response.get("usage", {}).get( "completion_tokens", 0 ) + cache_creation_input_tokens = completion_response.get("usage", {}).get( + "cache_creation_input_tokens", 0 + ) + cache_read_input_tokens = completion_response.get("usage", {}).get( + "cache_read_input_tokens", 0 + ) + total_time = getattr(completion_response, "_response_ms", 0) verbose_logger.debug( f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} " @@ -550,7 +580,7 @@ def completion_cost( ) if hasattr(completion_response, "_hidden_params"): custom_llm_provider = completion_response._hidden_params.get( - "custom_llm_provider", custom_llm_provider or "" + "custom_llm_provider", custom_llm_provider or None ) region_name = completion_response._hidden_params.get( "region_name", region_name @@ 
-697,6 +727,8 @@ def completion_cost( custom_cost_per_token=custom_cost_per_token, prompt_characters=prompt_characters, completion_characters=completion_characters, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, call_type=call_type, ) _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic/chat.py similarity index 99% rename from litellm/llms/anthropic.py rename to litellm/llms/anthropic/chat.py index 813897c66..f62c7246e 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic/chat.py @@ -1,3 +1,7 @@ +""" +Calling + translation logic for anthropic's `/v1/messages` endpoint +""" + import copy import json import os @@ -70,8 +74,8 @@ from litellm.types.llms.openai import ( from litellm.types.utils import Choices, GenericStreamingChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): @@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM): ) except Exception as e: verbose_logger.exception( - "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( + "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( str(e), messages ) ) diff --git a/litellm/llms/anthropic_text.py b/litellm/llms/anthropic/completion.py similarity index 98% rename from litellm/llms/anthropic_text.py rename to litellm/llms/anthropic/completion.py index d20e49daf..dd2d47e53 100644 --- a/litellm/llms/anthropic_text.py +++ b/litellm/llms/anthropic/completion.py @@ -1,3 +1,7 @@ +""" +Translation logic for anthropic's `/v1/complete` endpoint +""" + import json import os import time @@ -12,8 +16,8 @@ import litellm from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): diff --git a/litellm/llms/anthropic/cost_calculation.py b/litellm/llms/anthropic/cost_calculation.py new file mode 100644 index 000000000..d1742aae9 --- /dev/null +++ b/litellm/llms/anthropic/cost_calculation.py @@ -0,0 +1,42 @@ +""" +Helper util for handling anthropic-specific cost calculation +- e.g.: prompt caching +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
+ + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + ## GET MODEL INFO + model_info = get_model_info(model=model, custom_llm_provider="anthropic") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + if model_info.get("cache_creation_input_token_cost") is not None: + prompt_cost += ( + usage._cache_creation_input_tokens # type: ignore + * model_info["cache_creation_input_token_cost"] + ) + if model_info.get("cache_read_input_token_cost") is not None: + prompt_cost += ( + usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"] # type: ignore + ) + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/base.py b/litellm/llms/base.py index 7e80de9ab..08c5e1992 100644 --- a/litellm/llms/base.py +++ b/litellm/llms/base.py @@ -1,11 +1,14 @@ ## This is a template base class to be used for adding new LLM providers via API calls +from typing import Any, Optional, Union + +import httpx +import requests + import litellm -import httpx, requests -from typing import Optional, Union -from litellm.litellm_core_utils.litellm_logging import Logging class BaseLLM: + _client_session: Optional[httpx.Client] = None def process_response( @@ -14,7 +17,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.ModelResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], @@ -33,7 +36,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.TextCompletionResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py index e85160a43..025b27240 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py @@ -267,18 +267,19 @@ def completion( ): try: import vertexai - from anthropic import AnthropicVertex - - from litellm.llms.anthropic import AnthropicChatCompletion - from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( - VertexLLM, - ) except: raise VertexAIError( status_code=400, message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""", ) + from anthropic import AnthropicVertex + + from litellm.llms.anthropic.chat import AnthropicChatCompletion + from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( + VertexLLM, + ) + if not ( hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models") ): diff --git a/litellm/main.py b/litellm/main.py index 95a106377..f9ef4a419 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache from .llms import ( ai21, aleph_alpha, - anthropic_text, baseten, bedrock, clarifai, cloudflare, - gemini, - huggingface_restapi, maritalk, nlp_cloud, ollama, @@ -93,13 +90,10 @@ from .llms import ( palm, petals, replicate, - together_ai, 
- triton, vllm, - watsonx, ) -from .llms.anthropic import AnthropicChatCompletion -from .llms.anthropic_text import AnthropicTextCompletion +from .llms.anthropic.chat import AnthropicChatCompletion +from .llms.anthropic.completion import AnthropicTextCompletion from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 5e6d0f2ab..a60743c65 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index b8f964ab3..b84ef7453 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,4 @@ model_list: - - model_name: "gemini/*" + - model_name: "gpt-3.5-turbo" litellm_params: - model: "gemini/*" \ No newline at end of file + model: "gpt-3.5-turbo" \ No newline at end of file diff --git a/litellm/tests/test_anthropic_completion.py b/litellm/tests/test_anthropic_completion.py index b5e01d448..b8ccf716e 100644 --- a/litellm/tests/test_anthropic_completion.py +++ b/litellm/tests/test_anthropic_completion.py @@ -10,7 +10,7 @@ from dotenv import load_dotenv import litellm.types import litellm.types.utils -from litellm.llms.anthropic import ModelResponseIterator +from litellm.llms.anthropic.chat import ModelResponseIterator load_dotenv() import io @@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream): anthropic_chunk_list = [ - {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}}, - {"type": "content_block_delta", "index": 0, - "delta": {"type": "text_delta", "text": " your question about the weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}}, - {"type": "content_block_delta", 
"index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}}, + { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "text", "text": ""}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": "To"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " answer"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " your question about the weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " in Boston and Los"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " Angeles today, I'll"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " need to"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " use"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " the"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " get_current_weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " function"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " both"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " cities"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": ". 
Let"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " me fetch"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " that"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " information"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " you."}, + }, {"type": "content_block_stop", "index": 0}, - {"type": "content_block_start", "index": 1, - "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}}, + { + "type": "content_block_start", + "index": 1, + "content_block": { + "type": "tool_use", + "id": "toolu_12345", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": '{"locat'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'}, + }, {"type": "content_block_stop", "index": 1}, - {"type": "content_block_start", "index": 2, - "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}}, + { + "type": "content_block_start", + "index": 2, + "content_block": { + "type": "tool_use", + "id": "toolu_023423423", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": '{"l'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "oca"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "tio"}, + }, + { + "type": "content_block_delta", + 
"index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "s Angel"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'}, + }, {"type": "content_block_stop", "index": 2}, - {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None}, - "usage": {"output_tokens": 137}}, - {"type": "message_stop"} + { + "type": "message_delta", + "delta": {"stop_reason": "tool_use", "stop_sequence": None}, + "usage": {"output_tokens": 137}, + }, + {"type": "message_stop"}, ] @@ -211,12 +353,12 @@ def test_anthropic_tool_streaming(): correct_tool_index = -1 for chunk in anthropic_chunk_list: parsed_chunk = response_iter.chunk_parser(chunk) - if tool_use := parsed_chunk.get('tool_use'): + if tool_use := parsed_chunk.get("tool_use"): # We only increment when a new block starts - if tool_use.get('id') is not None: + if tool_use.get("id") is not None: correct_tool_index += 1 - assert tool_use['index'] == correct_tool_index + assert tool_use["index"] == correct_tool_index @pytest.mark.asyncio @@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation(): print(translated_params["messages"]) assert len(translated_params["messages"]) > 0 - assert translated_params["messages"][0]["role"] == "user" \ No newline at end of file + assert translated_params["messages"][0]["role"] == "user" diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index e9326752f..f48a85cad 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name(): print(f"mock_client.call_args: {mock_client.call_args.kwargs}") assert "azure/gpt-4" == mock_client.call_args.kwargs["model"] + + +def test_completion_cost_anthropic_prompt_caching(): + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + from litellm.utils import Choices, Message, ModelResponse, Usage + + model = "anthropic/claude-3-5-sonnet-20240620" + + ## WRITE TO CACHE ## (MORE EXPENSIVE) + response_1 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=100, + cache_read_input_tokens=0, + ), + ) + + ## READ FROM CACHE ## (LESS EXPENSIVE) + response_2 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! 
I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=0, + cache_read_input_tokens=100, + ), + ) + + cost_1 = completion_cost(model=model, completion_response=response_1) + cost_2 = completion_cost(model=model, completion_response=response_2) + + assert cost_1 > cost_2 diff --git a/litellm/tests/test_dynamic_rate_limit_handler.py b/litellm/tests/test_dynamic_rate_limit_handler.py index f49a760af..d711de71f 100644 --- a/litellm/tests/test_dynamic_rate_limit_handler.py +++ b/litellm/tests/test_dynamic_rate_limit_handler.py @@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response): @pytest.mark.asyncio +@pytest.mark.flaky(retries=3, delay=1) async def test_update_cache( dynamic_rate_limit_handler, mock_response, user_api_key_auth ): diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index e8bc999f2..54e2e5b43 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings(): assert len(optional_params) == 0 +def test_google_ai_studio_optional_params_embeddings(): + optional_params = get_optional_params_embeddings( + user="John", + encoding_format=None, + custom_llm_provider="gemini", + drop_params=True, + ) + assert len(optional_params) == 0 + + def test_openai_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 81dc268af..aadbdd22a 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False): max_input_tokens: Required[Optional[int]] max_output_tokens: Required[Optional[int]] input_cost_per_token: Required[float] + cache_creation_input_token_cost: Optional[float] + cache_read_input_token_cost: Optional[float] input_cost_per_character: Optional[float] # only for vertex ai models input_cost_per_token_above_128k_tokens: Optional[float] # only for vertex ai models input_cost_per_character_above_128k_tokens: Optional[ @@ -454,6 +456,13 @@ class Choices(OpenAIObject): class Usage(CompletionUsage): + _cache_creation_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. + _cache_read_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. 
+ def __init__( self, prompt_tokens: Optional[int] = None, @@ -466,9 +475,18 @@ class Usage(CompletionUsage): "completion_tokens": completion_tokens or 0, "total_tokens": total_tokens or 0, } - super().__init__(**data) + if "cache_creation_input_tokens" in params and isinstance( + params["cache_creation_input_tokens"], int + ): + self._cache_creation_input_tokens = params["cache_creation_input_tokens"] + + if "cache_read_input_tokens" in params and isinstance( + params["cache_read_input_tokens"], int + ): + self._cache_read_input_tokens = params["cache_read_input_tokens"] + for k, v in params.items(): setattr(self, k, v) diff --git a/litellm/utils.py b/litellm/utils.py index ec4ac79c0..bb50900d0 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2550,6 +2550,7 @@ def get_optional_params_embeddings( encoding_format=None, dimensions=None, custom_llm_provider="", + drop_params: Optional[bool] = None, additional_drop_params: Optional[bool] = None, **kwargs, ): @@ -2560,6 +2561,7 @@ def get_optional_params_embeddings( for k, v in special_params.items(): passed_params[k] = v + drop_params = passed_params.pop("drop_params", None) additional_drop_params = passed_params.pop("additional_drop_params", None) default_params = {"user": None, "encoding_format": None, "dimensions": None} @@ -2571,11 +2573,16 @@ def get_optional_params_embeddings( for k in non_default_params.keys(): if k not in supported_params: unsupported_params[k] = non_default_params[k] - if unsupported_params and not litellm.drop_params: - raise UnsupportedParamsError( - status_code=500, - message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", - ) + if unsupported_params: + if litellm.drop_params is True or ( + drop_params is not None and drop_params is True + ): + pass + else: + raise UnsupportedParamsError( + status_code=500, + message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. 
To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", + ) non_default_params = _get_non_default_params( passed_params=passed_params, @@ -2680,7 +2687,9 @@ def get_optional_params_embeddings( and custom_llm_provider not in litellm.openai_compatible_providers ): if len(non_default_params.keys()) > 0: - if litellm.drop_params is True: # drop the unsupported non-default values + if ( + litellm.drop_params is True or drop_params is True + ): # drop the unsupported non-default values keys = list(non_default_params.keys()) for k in keys: non_default_params.pop(k, None) @@ -5335,6 +5344,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod max_input_tokens=_model_info.get("max_input_tokens", None), max_output_tokens=_model_info.get("max_output_tokens", None), input_cost_per_token=_input_cost_per_token, + cache_creation_input_token_cost=_model_info.get( + "cache_creation_input_token_cost", None + ), + cache_read_input_token_cost=_model_info.get( + "cache_read_input_token_cost", None + ), input_cost_per_character=_model_info.get( "input_cost_per_character", None ), diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 5e6d0f2ab..a60743c65 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, From 336022e97c5400f6c38950a540fda08b114161dc Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Sat, 31 Aug 2024 14:34:00 -0700 Subject: [PATCH 4/4] test: skip test on end of life model --- litellm/tests/test_streaming.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d2ef8aafc..1b8b4e085 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -1545,6 +1545,7 @@ def test_completion_bedrock_claude_stream(): # test_completion_bedrock_claude_stream() +@pytest.mark.skip(reason="model end of life") def test_completion_bedrock_ai21_stream(): try: litellm.set_verbose = False
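Usage notes (illustrative, not part of the patches above):

The `litellm_error_code_metric_total` counter from PATCH 1 (documented in PATCH 2) is labelled by `error_code` and `model`, so a scraped sample from `/metrics` would look roughly like `litellm_error_code_metric_total{error_code="429",model="gpt-3.5-turbo"} 3.0` (hypothetical label values and count).

For the prompt-caching cost tracking in PATCH 3, the sketch below exercises the new `cost_per_token` helper in `litellm/llms/anthropic/cost_calculation.py` directly. It is a minimal sketch only: it assumes the `claude-3-5-sonnet-20240620` cache rates added to `model_prices_and_context_window.json` in this series, and it mirrors the comparison made in `test_completion_cost_anthropic_prompt_caching`.

```python
from litellm.llms.anthropic.cost_calculation import cost_per_token
from litellm.types.utils import Usage

# 14 regular prompt tokens, plus 100 tokens written to the prompt cache
cache_write_usage = Usage(
    prompt_tokens=14,
    completion_tokens=10,
    total_tokens=24,
    cache_creation_input_tokens=100,
    cache_read_input_tokens=0,
)

# Same request shape, but the 100 cached tokens are read back instead
cache_read_usage = Usage(
    prompt_tokens=14,
    completion_tokens=10,
    total_tokens=24,
    cache_creation_input_tokens=0,
    cache_read_input_tokens=100,
)

write_prompt_cost, _ = cost_per_token(
    model="claude-3-5-sonnet-20240620", usage=cache_write_usage
)
read_prompt_cost, _ = cost_per_token(
    model="claude-3-5-sonnet-20240620", usage=cache_read_usage
)

# Cache writes are billed above the base input rate and cache reads well below it,
# so the write-heavy request should cost more.
assert write_prompt_cost > read_prompt_cost
```

The same accounting flows through `litellm.completion_cost()` automatically: `cost_calculator.cost_per_token()` now rebuilds a `Usage` block carrying `cache_creation_input_tokens` / `cache_read_input_tokens` and routes `custom_llm_provider == "anthropic"` to this helper.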