From f2c0a31e3cb46c3d9586af454fef77adc670f64f Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Sat, 5 Oct 2024 18:59:11 -0400 Subject: [PATCH] LiteLLM Minor Fixes & Improvements (10/05/2024) (#6083) * docs(prompt_caching.md): add prompt caching cost calc example to docs * docs(prompt_caching.md): add proxy examples to docs * feat(utils.py): expose new helper `supports_prompt_caching()` to check if a model supports prompt caching * docs(prompt_caching.md): add docs on checking model support for prompt caching * build: fix invalid json --- .../docs/completion/prompt_caching.md | 305 +++++++++++++++++- ...odel_prices_and_context_window_backup.json | 69 ++-- litellm/proxy/_new_secret_config.yaml | 12 +- litellm/utils.py | 34 ++ model_prices_and_context_window.json | 69 ++-- tests/local_testing/test_get_model_file.py | 21 +- tests/local_testing/test_prompt_caching.py | 8 + 7 files changed, 459 insertions(+), 59 deletions(-) diff --git a/docs/my-website/docs/completion/prompt_caching.md b/docs/my-website/docs/completion/prompt_caching.md index 3a5537b52..5c795778e 100644 --- a/docs/my-website/docs/completion/prompt_caching.md +++ b/docs/my-website/docs/completion/prompt_caching.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Prompt Caching For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format: @@ -30,6 +33,9 @@ For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usa Note: OpenAI caching is only available for prompts containing 1024 tokens or more + + + ```python from litellm import completion import os @@ -87,6 +93,90 @@ assert "prompt_tokens_details" in response.usage assert response.usage.prompt_tokens_details.cached_tokens > 0 ``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key="LITELLM_PROXY_KEY", # sk-1234 + base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000 +) + +for _ in range(2): + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + # System Message + { + "role": "system", + "content": [ + { + "type": "text", + "text": "Here is the full text of a complex legal agreement" + * 400, + } + ], + }, + # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache. + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + } + ], + }, + { + "role": "assistant", + "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo", + }, + # The final turn is marked with cache-control, for continuing in followups. + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + } + ], + }, + ], + temperature=0.2, + max_tokens=10, + ) + +print("response=", response) +print("response.usage=", response.usage) + +assert "prompt_tokens_details" in response.usage +assert response.usage.prompt_tokens_details.cached_tokens > 0 +``` + + + + ### Anthropic Example Anthropic charges for cache writes. @@ -95,6 +185,9 @@ Specify the content to cache with `"cache_control": {"type": "ephemeral"}`. If you pass that in for any other llm provider, it will be ignored. 
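At a glance, `cache_control` is set on an individual content block within a message, not on the request as a whole. A minimal sketch of that shape is below (the repeated text is a placeholder); the full SDK and proxy examples follow.

```python
# Minimal sketch: where cache_control sits inside a message.
# The repeated placeholder text stands in for the real content you want cached.
system_message = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are an AI assistant tasked with analyzing legal documents.",
        },
        {
            "type": "text",
            "text": "Here is the full text of a complex legal agreement" * 400,
            "cache_control": {"type": "ephemeral"},  # 👈 marks this block for caching
        },
    ],
}
```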
+ + + ```python from litellm import completion import litellm @@ -129,6 +222,65 @@ response = completion( print(response.usage) ``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-3-5-sonnet-20240620 + litellm_params: + model: anthropic/claude-3-5-sonnet-20240620 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key="LITELLM_PROXY_KEY", # sk-1234 + base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000 +) + +response = client.chat.completions.create( + model="claude-3-5-sonnet-20240620", + messages=[ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "You are an AI assistant tasked with analyzing legal documents.", + }, + { + "type": "text", + "text": "Here is the full text of a complex legal agreement" * 400, + "cache_control": {"type": "ephemeral"}, + }, + ], + }, + { + "role": "user", + "content": "what are the key terms and conditions in this agreement?", + }, + ] +) + +print(response.usage) +``` + + + ### Deepeek Example @@ -196,4 +348,155 @@ response_2 = litellm.completion(model=model_name, messages=message_2) # Add any assertions here to check the response print(response_2.usage) -``` \ No newline at end of file +``` + + +## Calculate Cost + +Cost cache-hit prompt tokens can differ from cache-miss prompt tokens. + +Use the `completion_cost()` function for calculating cost ([handles prompt caching cost calculation](https://github.com/BerriAI/litellm/blob/f7ce1173f3315cc6cae06cf9bcf12e54a2a19705/litellm/llms/anthropic/cost_calculation.py#L12) as well). [**See more helper functions**](./token_usage.md) + +```python +cost = completion_cost(completion_response=response, model=model) +``` + +### Usage + + + + +```python +from litellm import completion, completion_cost +import litellm +import os + +litellm.set_verbose = True # 👈 SEE RAW REQUEST +os.environ["ANTHROPIC_API_KEY"] = "" +model = "anthropic/claude-3-5-sonnet-20240620" +response = completion( + model=model, + messages=[ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "You are an AI assistant tasked with analyzing legal documents.", + }, + { + "type": "text", + "text": "Here is the full text of a complex legal agreement" * 400, + "cache_control": {"type": "ephemeral"}, + }, + ], + }, + { + "role": "user", + "content": "what are the key terms and conditions in this agreement?", + }, + ] +) + +print(response.usage) + +cost = completion_cost(completion_response=response, model=model) + +formatted_string = f"${float(cost):.10f}" +print(formatted_string) +``` + + + +LiteLLM returns the calculated cost in the response headers - `x-litellm-response-cost` + +```python +from openai import OpenAI + +client = OpenAI( + api_key="LITELLM_PROXY_KEY", # sk-1234.. 
+ base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000 +) +response = client.chat.completions.with_raw_response.create( + messages=[{ + "role": "user", + "content": "Say this is a test", + }], + model="gpt-3.5-turbo", +) +print(response.headers.get('x-litellm-response-cost')) + +completion = response.parse() # get the object that `chat.completions.create()` would have returned +print(completion) +``` + + + + +## Check Model Support + +Check if a model supports prompt caching with `supports_prompt_caching()` + + + + +```python +from litellm.utils import supports_prompt_caching + +supports_pc: bool = supports_prompt_caching(model="anthropic/claude-3-5-sonnet-20240620") + +assert supports_pc +``` + + + + +Use the `/model/info` endpoint to check if a model on the proxy supports prompt caching + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-3-5-sonnet-20240620 + litellm_params: + model: anthropic/claude-3-5-sonnet-20240620 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \ +-H 'Authorization: Bearer sk-1234' \ +``` + +**Expected Response** + +```bash +{ + "data": [ + { + "model_name": "claude-3-5-sonnet-20240620", + "litellm_params": { + "model": "anthropic/claude-3-5-sonnet-20240620" + }, + "model_info": { + "key": "claude-3-5-sonnet-20240620", + ... + "supports_prompt_caching": true # 👈 LOOK FOR THIS! + } + } + ] +} +``` + + + + +This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) \ No newline at end of file diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 0fe6f6b87..7d060dd83 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -9,7 +9,8 @@ "mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4": { "max_tokens": 4096, @@ -19,7 +20,8 @@ "output_cost_per_token": 0.00006, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-4o": { "max_tokens": 4096, @@ -129,7 +131,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4o-2024-05-13": { "max_tokens": 4096, @@ -141,7 +144,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4o-2024-08-06": { "max_tokens": 16384, @@ -166,7 +170,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-0314": { "max_tokens": 4096, @@ -175,7 +180,8 @@ "input_cost_per_token": 0.00003, "output_cost_per_token": 0.00006, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-0613": { "max_tokens": 4096, @@ -185,7 +191,8 @@ "output_cost_per_token": 0.00006, 
"litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-32k": { "max_tokens": 4096, @@ -194,7 +201,8 @@ "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-32k-0314": { "max_tokens": 4096, @@ -203,7 +211,8 @@ "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-32k-0613": { "max_tokens": 4096, @@ -212,7 +221,8 @@ "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-turbo": { "max_tokens": 4096, @@ -224,7 +234,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4-turbo-2024-04-09": { "max_tokens": 4096, @@ -236,7 +247,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4-1106-preview": { "max_tokens": 4096, @@ -247,7 +259,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-0125-preview": { "max_tokens": 4096, @@ -258,7 +271,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-vision-preview": { "max_tokens": 4096, @@ -268,7 +282,8 @@ "output_cost_per_token": 0.00003, "litellm_provider": "openai", "mode": "chat", - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4-1106-vision-preview": { "max_tokens": 4096, @@ -278,7 +293,8 @@ "output_cost_per_token": 0.00003, "litellm_provider": "openai", "mode": "chat", - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo": { "max_tokens": 4097, @@ -288,7 +304,8 @@ "output_cost_per_token": 0.000002, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-0301": { "max_tokens": 4097, @@ -297,7 +314,8 @@ "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-3.5-turbo-0613": { "max_tokens": 4097, @@ -307,7 +325,8 @@ "output_cost_per_token": 0.000002, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-1106": { "max_tokens": 16385, @@ -318,7 +337,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-0125": { "max_tokens": 16385, @@ -329,7 +349,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - 
"supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-16k": { "max_tokens": 16385, @@ -338,7 +359,8 @@ "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-3.5-turbo-16k-0613": { "max_tokens": 16385, @@ -347,7 +369,8 @@ "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "ft:gpt-3.5-turbo": { "max_tokens": 4096, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 2c910865a..e2afa3f12 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,9 +1,5 @@ model_list: - - model_name: fake-openai-endpoint - litellm_params: - model: openai/fake - api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - -litellm_settings: - callbacks: ["gcs_bucket"] + - model_name: claude-3-5-sonnet-20240620 + litellm_params: + model: anthropic/claude-3-5-sonnet-20240620 + api_key: os.environ/ANTHROPIC_API_KEY \ No newline at end of file diff --git a/litellm/utils.py b/litellm/utils.py index 753e07f80..cc52c1f56 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2179,6 +2179,40 @@ def supports_function_calling( ) +def supports_prompt_caching( + model: str, custom_llm_provider: Optional[str] = None +) -> bool: + """ + Check if the given model supports prompt caching and return a boolean value. + + Parameters: + model (str): The model name to be checked. + custom_llm_provider (Optional[str]): The provider to be checked. + + Returns: + bool: True if the model supports prompt caching, False otherwise. + + Raises: + Exception: If the given model is not found or there's an error in retrieval. + """ + try: + model, custom_llm_provider, _, _ = litellm.get_llm_provider( + model=model, custom_llm_provider=custom_llm_provider + ) + + model_info = litellm.get_model_info( + model=model, custom_llm_provider=custom_llm_provider + ) + + if model_info.get("supports_prompt_caching", False) is True: + return True + return False + except Exception as e: + raise Exception( + f"Model not found or error in checking prompt caching support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}" + ) + + def supports_vision(model: str, custom_llm_provider: Optional[str] = None) -> bool: """ Check if the given model supports vision and return a boolean value. 
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 0fe6f6b87..7d060dd83 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -9,7 +9,8 @@ "mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4": { "max_tokens": 4096, @@ -19,7 +20,8 @@ "output_cost_per_token": 0.00006, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-4o": { "max_tokens": 4096, @@ -129,7 +131,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4o-2024-05-13": { "max_tokens": 4096, @@ -141,7 +144,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4o-2024-08-06": { "max_tokens": 16384, @@ -166,7 +170,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-0314": { "max_tokens": 4096, @@ -175,7 +180,8 @@ "input_cost_per_token": 0.00003, "output_cost_per_token": 0.00006, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-0613": { "max_tokens": 4096, @@ -185,7 +191,8 @@ "output_cost_per_token": 0.00006, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-32k": { "max_tokens": 4096, @@ -194,7 +201,8 @@ "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-32k-0314": { "max_tokens": 4096, @@ -203,7 +211,8 @@ "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-32k-0613": { "max_tokens": 4096, @@ -212,7 +221,8 @@ "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-4-turbo": { "max_tokens": 4096, @@ -224,7 +234,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4-turbo-2024-04-09": { "max_tokens": 4096, @@ -236,7 +247,8 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4-1106-preview": { "max_tokens": 4096, @@ -247,7 +259,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-0125-preview": { "max_tokens": 4096, @@ -258,7 +271,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": 
true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-4-vision-preview": { "max_tokens": 4096, @@ -268,7 +282,8 @@ "output_cost_per_token": 0.00003, "litellm_provider": "openai", "mode": "chat", - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-4-1106-vision-preview": { "max_tokens": 4096, @@ -278,7 +293,8 @@ "output_cost_per_token": 0.00003, "litellm_provider": "openai", "mode": "chat", - "supports_vision": true + "supports_vision": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo": { "max_tokens": 4097, @@ -288,7 +304,8 @@ "output_cost_per_token": 0.000002, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-0301": { "max_tokens": 4097, @@ -297,7 +314,8 @@ "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-3.5-turbo-0613": { "max_tokens": 4097, @@ -307,7 +325,8 @@ "output_cost_per_token": 0.000002, "litellm_provider": "openai", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-1106": { "max_tokens": 16385, @@ -318,7 +337,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-0125": { "max_tokens": 16385, @@ -329,7 +349,8 @@ "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "supports_prompt_caching": true }, "gpt-3.5-turbo-16k": { "max_tokens": 16385, @@ -338,7 +359,8 @@ "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "gpt-3.5-turbo-16k-0613": { "max_tokens": 16385, @@ -347,7 +369,8 @@ "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004, "litellm_provider": "openai", - "mode": "chat" + "mode": "chat", + "supports_prompt_caching": true }, "ft:gpt-3.5-turbo": { "max_tokens": 4096, diff --git a/tests/local_testing/test_get_model_file.py b/tests/local_testing/test_get_model_file.py index 949ff43b8..17bd2d7ce 100644 --- a/tests/local_testing/test_get_model_file.py +++ b/tests/local_testing/test_get_model_file.py @@ -1,4 +1,6 @@ import os, sys, traceback +import importlib.resources +import json sys.path.insert( 0, os.path.abspath("../..") @@ -6,7 +8,18 @@ sys.path.insert( import litellm import pytest -try: - print(litellm.get_model_cost_map(url="fake-url")) -except Exception as e: - pytest.fail(f"An exception occurred: {e}") + +def test_get_model_cost_map(): + try: + print(litellm.get_model_cost_map(url="fake-url")) + except Exception as e: + pytest.fail(f"An exception occurred: {e}") + + +def test_get_backup_model_cost_map(): + with importlib.resources.open_text( + "litellm", "model_prices_and_context_window_backup.json" + ) as f: + print("inside backup") + content = json.load(f) + print("content", content) diff --git a/tests/local_testing/test_prompt_caching.py b/tests/local_testing/test_prompt_caching.py index 301ead3aa..35d5e2588 100644 --- a/tests/local_testing/test_prompt_caching.py +++ 
b/tests/local_testing/test_prompt_caching.py
@@ -111,3 +111,11 @@ def test_prompt_caching_model(model):
     # assert (response.usage.cache_read_input_tokens > 0) or (
     #     response.usage.cache_creation_input_tokens > 0
     # )
+
+
+def test_supports_prompt_caching():
+    from litellm.utils import supports_prompt_caching
+
+    supports_pc = supports_prompt_caching(model="anthropic/claude-3-5-sonnet-20240620")
+
+    assert supports_pc
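A possible companion test (a sketch, not included in this patch) that verifies the same flag through `litellm.get_model_info()`, which the new helper calls internally; the test name is illustrative.

```python
# Sketch (not part of this patch): check the "supports_prompt_caching" flag directly
# via litellm.get_model_info(), mirroring the call supports_prompt_caching() makes internally.
import litellm


def test_model_info_reports_prompt_caching():
    model_info = litellm.get_model_info(
        model="claude-3-5-sonnet-20240620", custom_llm_provider="anthropic"
    )

    assert model_info.get("supports_prompt_caching", False) is True
```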