LiteLLM Minor Fixes & Improvements (10/05/2024) (#6083)

* docs(prompt_caching.md): add prompt caching cost calc example to docs

* docs(prompt_caching.md): add proxy examples to docs

* feat(utils.py): expose new helper `supports_prompt_caching()` to check if a model supports prompt caching

* docs(prompt_caching.md): add docs on checking model support for prompt caching

* build: fix invalid json
Krish Dholakia 2024-10-05 18:59:11 -04:00 committed by GitHub
parent fac3b2ee42
commit f2c0a31e3c
7 changed files with 459 additions and 59 deletions


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Prompt Caching
For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format:
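For reference, here is a minimal sketch (not part of this diff) of where the cached-token count surfaces on a response, assuming the OpenAI-style usage object described above:
```python
# Sketch only: assumes LiteLLM normalizes usage to the OpenAI prompt caching format,
# i.e. cached tokens are reported under usage.prompt_tokens_details.
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = ""

response = completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello"}],
)

usage = response.usage
print(usage.prompt_tokens)  # total prompt tokens for the request
# cached_tokens stays 0 until a >=1024-token prefix has been cached by a prior request
print(usage.prompt_tokens_details.cached_tokens)
```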
@ -30,6 +33,9 @@ For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usa
Note: OpenAI caching is only available for prompts containing 1024 tokens or more
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
@ -87,6 +93,90 @@ assert "prompt_tokens_details" in response.usage
assert response.usage.prompt_tokens_details.cached_tokens > 0
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
  - model_name: gpt-4o
    litellm_params:
      model: openai/gpt-4o
      api_key: os.environ/OPENAI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python
from openai import OpenAI
import os
client = OpenAI(
    api_key="LITELLM_PROXY_KEY", # sk-1234
    base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000
)
for _ in range(2):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement" * 400,
                    }
                ],
            },
            # OpenAI caches the long shared prefix above automatically - no cache_control marker is needed.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! The key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # Sending the same long prefix in this follow-up turn lets it read from the cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
    )
print("response=", response)
print("response.usage=", response.usage)
assert "prompt_tokens_details" in response.usage
assert response.usage.prompt_tokens_details.cached_tokens > 0
```
</TabItem>
</Tabs>
### Anthropic Example
Anthropic charges for cache writes.
@ -95,6 +185,9 @@ Specify the content to cache with `"cache_control": {"type": "ephemeral"}`.
If you pass that in for any other llm provider, it will be ignored.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import litellm
@ -129,6 +222,65 @@ response = completion(
print(response.usage)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
  - model_name: claude-3-5-sonnet-20240620
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20240620
      api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python
from openai import OpenAI
import os
client = OpenAI(
    api_key="LITELLM_PROXY_KEY", # sk-1234
    base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000
)

response = client.chat.completions.create(
    model="claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    "cache_control": {"type": "ephemeral"},
                },
            ],
        },
        {
            "role": "user",
            "content": "what are the key terms and conditions in this agreement?",
        },
    ]
)
print(response.usage)
```
</TabItem>
</Tabs>
### Deepseek Example
@ -196,4 +348,155 @@ response_2 = litellm.completion(model=model_name, messages=message_2)
# Add any assertions here to check the response
print(response_2.usage)
```
## Calculate Cost
The cost of cache-hit prompt tokens can differ from the cost of cache-miss prompt tokens.
Use the `completion_cost()` function for calculating cost ([handles prompt caching cost calculation](https://github.com/BerriAI/litellm/blob/f7ce1173f3315cc6cae06cf9bcf12e54a2a19705/litellm/llms/anthropic/cost_calculation.py#L12) as well). [**See more helper functions**](./token_usage.md)
```python
cost = completion_cost(completion_response=response, model=model)
```
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion, completion_cost
import litellm
import os
litellm.set_verbose = True # 👈 SEE RAW REQUEST
os.environ["ANTHROPIC_API_KEY"] = ""
model = "anthropic/claude-3-5-sonnet-20240620"
response = completion(
    model=model,
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    "cache_control": {"type": "ephemeral"},
                },
            ],
        },
        {
            "role": "user",
            "content": "what are the key terms and conditions in this agreement?",
        },
    ]
)
print(response.usage)
cost = completion_cost(completion_response=response, model=model)
formatted_string = f"${float(cost):.10f}"
print(formatted_string)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
LiteLLM returns the calculated cost in the `x-litellm-response-cost` response header.
```python
from openai import OpenAI
client = OpenAI(
    api_key="LITELLM_PROXY_KEY", # sk-1234..
    base_url="LITELLM_PROXY_BASE" # http://0.0.0.0:4000
)

response = client.chat.completions.with_raw_response.create(
    messages=[{
        "role": "user",
        "content": "Say this is a test",
    }],
    model="gpt-3.5-turbo",
)
print(response.headers.get('x-litellm-response-cost'))
completion = response.parse() # get the object that `chat.completions.create()` would have returned
print(completion)
```
</TabItem>
</Tabs>
## Check Model Support
Check if a model supports prompt caching with `supports_prompt_caching()`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm.utils import supports_prompt_caching
supports_pc: bool = supports_prompt_caching(model="anthropic/claude-3-5-sonnet-20240620")
assert supports_pc
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Use the `/model/info` endpoint to check if a model on the proxy supports prompt caching
1. Setup config.yaml
```yaml
model_list:
  - model_name: claude-3-5-sonnet-20240620
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20240620
      api_key: os.environ/ANTHROPIC_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
    -H 'Authorization: Bearer sk-1234'
```
**Expected Response**
```bash
{
"data": [
{
"model_name": "claude-3-5-sonnet-20240620",
"litellm_params": {
"model": "anthropic/claude-3-5-sonnet-20240620"
},
"model_info": {
"key": "claude-3-5-sonnet-20240620",
...
"supports_prompt_caching": true # 👈 LOOK FOR THIS!
}
}
]
}
```
</TabItem>
</Tabs>
This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
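For a rough sense of what the helper reads from that map, here is a hedged sketch using `litellm.get_model_info()` (the same call the new `supports_prompt_caching()` helper makes in the `utils.py` diff below); treat the exact lookup as illustrative:
```python
# Sketch: look up the supports_prompt_caching flag straight from the cost map.
# get_model_info() is the call used inside supports_prompt_caching().
import litellm

info = litellm.get_model_info(model="gpt-4o", custom_llm_provider="openai")
print(info.get("supports_prompt_caching", False))  # True if the map marks the model
```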


@ -9,7 +9,8 @@
"mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4": {
"max_tokens": 4096,
@ -19,7 +20,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4o": {
"max_tokens": 4096,
@ -129,7 +131,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-05-13": {
"max_tokens": 4096,
@ -141,7 +144,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -166,7 +170,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0314": {
"max_tokens": 4096,
@ -175,7 +180,8 @@
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-0613": {
"max_tokens": 4096,
@ -185,7 +191,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-32k": {
"max_tokens": 4096,
@ -194,7 +201,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0314": {
"max_tokens": 4096,
@ -203,7 +211,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0613": {
"max_tokens": 4096,
@ -212,7 +221,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-turbo": {
"max_tokens": 4096,
@ -224,7 +234,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
@ -236,7 +247,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
@ -247,7 +259,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0125-preview": {
"max_tokens": 4096,
@ -258,7 +271,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-vision-preview": {
"max_tokens": 4096,
@ -268,7 +282,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-vision-preview": {
"max_tokens": 4096,
@ -278,7 +293,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
@ -288,7 +304,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4097,
@ -297,7 +314,8 @@
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0613": {
"max_tokens": 4097,
@ -307,7 +325,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-1106": {
"max_tokens": 16385,
@ -318,7 +337,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0125": {
"max_tokens": 16385,
@ -329,7 +349,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16385,
@ -338,7 +359,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16385,
@ -347,7 +369,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"ft:gpt-3.5-turbo": {
"max_tokens": 4096,


@ -1,9 +1,5 @@
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["gcs_bucket"]
- model_name: claude-3-5-sonnet-20240620
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
api_key: os.environ/ANTHROPIC_API_KEY


@ -2179,6 +2179,40 @@ def supports_function_calling(
)
def supports_prompt_caching(
    model: str, custom_llm_provider: Optional[str] = None
) -> bool:
    """
    Check if the given model supports prompt caching and return a boolean value.

    Parameters:
        model (str): The model name to be checked.
        custom_llm_provider (Optional[str]): The provider to be checked.

    Returns:
        bool: True if the model supports prompt caching, False otherwise.

    Raises:
        Exception: If the given model is not found or there's an error in retrieval.
    """
    try:
        model, custom_llm_provider, _, _ = litellm.get_llm_provider(
            model=model, custom_llm_provider=custom_llm_provider
        )

        model_info = litellm.get_model_info(
            model=model, custom_llm_provider=custom_llm_provider
        )

        if model_info.get("supports_prompt_caching", False) is True:
            return True
        return False
    except Exception as e:
        raise Exception(
            f"Model not found or error in checking prompt caching support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
        )


def supports_vision(model: str, custom_llm_provider: Optional[str] = None) -> bool:
    """
    Check if the given model supports vision and return a boolean value.


@ -9,7 +9,8 @@
"mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4": {
"max_tokens": 4096,
@ -19,7 +20,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4o": {
"max_tokens": 4096,
@ -129,7 +131,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-05-13": {
"max_tokens": 4096,
@ -141,7 +144,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -166,7 +170,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0314": {
"max_tokens": 4096,
@ -175,7 +180,8 @@
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-0613": {
"max_tokens": 4096,
@ -185,7 +191,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-32k": {
"max_tokens": 4096,
@ -194,7 +201,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0314": {
"max_tokens": 4096,
@ -203,7 +211,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-32k-0613": {
"max_tokens": 4096,
@ -212,7 +221,8 @@
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-4-turbo": {
"max_tokens": 4096,
@ -224,7 +234,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
@ -236,7 +247,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
@ -247,7 +259,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-0125-preview": {
"max_tokens": 4096,
@ -258,7 +271,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-4-vision-preview": {
"max_tokens": 4096,
@ -268,7 +282,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-4-1106-vision-preview": {
"max_tokens": 4096,
@ -278,7 +293,8 @@
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true
"supports_vision": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
@ -288,7 +304,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4097,
@ -297,7 +314,8 @@
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0613": {
"max_tokens": 4097,
@ -307,7 +325,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true
"supports_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-1106": {
"max_tokens": 16385,
@ -318,7 +337,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-0125": {
"max_tokens": 16385,
@ -329,7 +349,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16385,
@ -338,7 +359,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16385,
@ -347,7 +369,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_prompt_caching": true
},
"ft:gpt-3.5-turbo": {
"max_tokens": 4096,


@ -1,4 +1,6 @@
import os, sys, traceback
import importlib.resources
import json
sys.path.insert(
    0, os.path.abspath("../..")
@ -6,7 +8,18 @@ sys.path.insert(
import litellm
import pytest
try:
    print(litellm.get_model_cost_map(url="fake-url"))
except Exception as e:
    pytest.fail(f"An exception occurred: {e}")


def test_get_model_cost_map():
    try:
        print(litellm.get_model_cost_map(url="fake-url"))
    except Exception as e:
        pytest.fail(f"An exception occurred: {e}")


def test_get_backup_model_cost_map():
    with importlib.resources.open_text(
        "litellm", "model_prices_and_context_window_backup.json"
    ) as f:
        print("inside backup")
        content = json.load(f)
        print("content", content)


@ -111,3 +111,11 @@ def test_prompt_caching_model(model):
    # assert (response.usage.cache_read_input_tokens > 0) or (
    #     response.usage.cache_creation_input_tokens > 0
    # )


def test_supports_prompt_caching():
    from litellm.utils import supports_prompt_caching

    supports_pc = supports_prompt_caching(model="anthropic/claude-3-5-sonnet-20240620")

    assert supports_pc