Merge branch 'main' into litellm_azure_ai_openai_support

Krish Dholakia 2024-08-14 17:53:27 -07:00 committed by GitHub
commit bda1ee16a9
34 changed files with 1805 additions and 180 deletions


@ -62,6 +62,11 @@ COPY --from=builder /wheels/ /wheels/
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate
RUN chmod +x entrypoint.sh


@ -62,6 +62,11 @@ RUN pip install PyJWT --no-cache-dir
RUN chmod +x build_admin_ui.sh && ./build_admin_ui.sh
# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate
RUN chmod +x entrypoint.sh


@ -225,22 +225,336 @@ print(response)
| claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
## **Prompt Caching**
Use Anthropic Prompt Caching
[Relevant Anthropic API Docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)
### Caching - Large Context Caching
This example demonstrates basic Prompt Caching usage, caching the full text of the legal agreement as a prefix while keeping the user instruction uncached.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK to send a request to the LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Tools definitions
In this example, we demonstrate caching tool definitions.
The `cache_control` parameter is placed on the final tool.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK to send a request to the LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Continuing Multi-Turn Convo
In this example, we demonstrate how to use Prompt Caching in a multi-turn conversation.
The `cache_control` parameter is placed on the system message to designate it as part of the static prefix.
The conversation history (previous messages) is included in the messages array. The second-to-last user message is marked for caching with the `cache_control` parameter, so that this checkpoint can read from the previous cache. The final turn is also marked with `cache_control`, so follow-up requests can continue from this cache.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK to send a request to the LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
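After a successful prompt-caching call, the cache accounting returned by Anthropic is surfaced on `response.usage`. A minimal sketch, assuming `ANTHROPIC_API_KEY` is set in the environment:

```python
import asyncio

import litellm


async def main():
    # Same shape as the SDK examples above; the long system block is the cached prefix.
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement" * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {"role": "user", "content": "What are the key terms and conditions in this agreement?"},
        ],
        max_tokens=10,
        extra_headers={
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
        },
    )
    # Populated when Anthropic reports cache activity for this request.
    print("cache_creation_input_tokens:", getattr(response.usage, "cache_creation_input_tokens", None))
    print("cache_read_input_tokens:", getattr(response.usage, "cache_read_input_tokens", None))


asyncio.run(main())
```

Whether the creation or the read counter is non-zero depends on whether the prefix was already cached within Anthropic's cache TTL.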
## **Function/Tool Calling**
:::info
@ -429,6 +743,20 @@ resp = litellm.completion(
print(f"\nResponse: {resp}")
```
## **Passing Extra Headers to Anthropic API**
Pass `extra_headers: dict` to `litellm.completion`
```python
from litellm import completion
messages = [{"role": "user", "content": "What is Anthropic?"}]
response = completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}
)
```
## Usage - "Assistant Pre-fill"
You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.


@ -17,7 +17,7 @@ model_list:
## Get Model Information - `/model/info`
Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled the model_info you set and the litellm model cost map. Sensitive details like API keys are excluded for security purposes.
Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the model_info you set and the [litellm model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Sensitive details like API keys are excluded for security purposes.
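A hypothetical Python equivalent of the cURL call below, assuming the proxy is reachable at `http://0.0.0.0:4000` and `LITELLM_KEY` holds a valid virtual key:

```python
import os

import requests

# Assumed proxy URL and key; adjust for your deployment.
resp = requests.get(
    "http://0.0.0.0:4000/model/info",
    headers={"Authorization": f"Bearer {os.environ['LITELLM_KEY']}"},
)
print(resp.json())  # model list with config.yaml info plus litellm cost map fields
```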
<Tabs
defaultValue="curl"
@ -35,14 +35,10 @@ curl -X GET "http://0.0.0.0:4000/model/info" \
## Add a New Model
Add a new model to the list in the `config.yaml` by providing the model parameters. This allows you to update the model list without restarting the proxy.
Add a new model to the proxy via the `/model/new` API, so you can add models without restarting the proxy.
<Tabs
defaultValue="curl"
values={[
{ label: 'cURL', value: 'curl', },
]}>
<TabItem value="curl">
<Tabs>
<TabItem value="API">
```bash
curl -X POST "http://0.0.0.0:4000/model/new" \
@ -50,6 +46,21 @@ curl -X POST "http://0.0.0.0:4000/model/new" \
-H "Content-Type: application/json" \
-d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'
```
</TabItem>
<TabItem value="Yaml">
```yaml
model_list:
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ### `openai.chat.completions.create(model="gpt-3.5-turbo",...)`
litellm_params: # all params accepted by litellm.completion() - https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/types/router.py#L297
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
model_info:
my_custom_key: my_custom_value # additional model metadata
```
</TabItem>
</Tabs>
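For completeness, a hypothetical Python version of the `/model/new` cURL request shown in the API tab above; it assumes the proxy runs at `http://0.0.0.0:4000` and that `sk-1234` is your master key.

```python
import requests

new_model = {
    "model_name": "azure-gpt-turbo",
    "litellm_params": {
        "model": "azure/gpt-3.5-turbo",
        "api_key": "os.environ/AZURE_API_KEY",
        "api_base": "my-azure-api-base",
    },
}

# Assumed proxy URL and master key; adjust for your deployment.
resp = requests.post(
    "http://0.0.0.0:4000/model/new",
    headers={"Authorization": "Bearer sk-1234"},
    json=new_model,
)
print(resp.status_code, resp.text)
```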
@ -86,3 +97,82 @@ Keep in mind that as both endpoints are in [BETA], you may need to visit the ass
- Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964)
Feedback on the beta endpoints is valuable and helps improve the API for all users.
## Add Additional Model Information
If you want the ability to add a display name, description, and labels for models, just use `model_info:`
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
### Usage
1. Add additional information to model
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
2. Call with `/model/info`
Use a key with access to the model `gpt-4`.
```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer LITELLM_KEY'
```
3. **Expected Response**
Returned `model_info = Your custom model_info + (if exists) LITELLM MODEL INFO`
[**How LiteLLM Model Info is found**](https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/proxy/proxy_server.py#L7460)
[Tell us how this can be improved!](https://github.com/BerriAI/litellm/issues)
```bash
{
"data": [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4"
},
"model_info": {
"id": "e889baacd17f591cce4c63639275ba5e8dc60765d6c553e6ee5a504b19e50ddc",
"db_model": false,
"my_custom_key": "my_custom_value", # 👈 CUSTOM INFO
"key": "gpt-4", # 👈 KEY in LiteLLM MODEL INFO/COST MAP - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
"max_tokens": 4096,
"max_input_tokens": 8192,
"max_output_tokens": 4096,
"input_cost_per_token": 3e-05,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"output_cost_per_token": 6e-05,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat"
}
},
]
}
```


@ -72,15 +72,15 @@ http://localhost:4000/metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
| `llm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `llm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `llm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `llm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `llm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `llm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
| `litellm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `litellm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
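As a quick way to verify the renamed metrics after upgrading, a small sketch (assuming the proxy is running locally on port 4000 with the `prometheus` callback enabled) that fetches `/metrics` and prints only the `litellm_deployment_*` series:

```python
import requests

# Assumes a local LiteLLM proxy with the `prometheus` callback enabled.
metrics = requests.get("http://localhost:4000/metrics", timeout=10).text

for line in metrics.splitlines():
    # Skip HELP/TYPE comment lines; keep only the renamed deployment metrics.
    if not line.startswith("#") and line.startswith("litellm_deployment_"):
        print(line)
```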


@ -1,5 +1,6 @@
import json
import os
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, TypedDict, Union
@ -29,6 +30,8 @@ class GCSBucketPayload(TypedDict):
end_time: str
response_cost: Optional[float]
spend_log_metadata: str
exception: Optional[str]
log_event_type: Optional[str]
class GCSBucketLogger(CustomLogger):
@ -79,6 +82,7 @@ class GCSBucketLogger(CustomLogger):
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
kwargs, response_obj, start_time_str, end_time_str
)
logging_payload["log_event_type"] = "successful_api_call"
json_logged_payload = json.dumps(logging_payload)
@ -103,7 +107,56 @@ class GCSBucketLogger(CustomLogger):
verbose_logger.error("GCS Bucket logging error: %s", str(e))
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
pass
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
try:
verbose_logger.debug(
"GCS Logger: async_log_failure_event logging kwargs: %s, response_obj: %s",
kwargs,
response_obj,
)
start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S")
headers = await self.construct_request_headers()
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
kwargs, response_obj, start_time_str, end_time_str
)
logging_payload["log_event_type"] = "failed_api_call"
_litellm_params = kwargs.get("litellm_params") or {}
metadata = _litellm_params.get("metadata") or {}
json_logged_payload = json.dumps(logging_payload)
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")
# Modify the object_name to include the date-based folder
object_name = f"{current_date}/failure-{uuid.uuid4().hex}"
if "gcs_log_id" in metadata:
object_name = metadata["gcs_log_id"]
response = await self.async_httpx_client.post(
headers=headers,
url=f"https://storage.googleapis.com/upload/storage/v1/b/{self.BUCKET_NAME}/o?uploadType=media&name={object_name}",
data=json_logged_payload,
)
if response.status_code != 200:
verbose_logger.error("GCS Bucket logging error: %s", str(response.text))
verbose_logger.debug("GCS Bucket response %s", response)
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
verbose_logger.debug("GCS Bucket response.text %s", response.text)
except Exception as e:
verbose_logger.error("GCS Bucket logging error: %s", str(e))
async def construct_request_headers(self) -> Dict[str, str]:
from litellm import vertex_chat_completion
@ -139,10 +192,19 @@ class GCSBucketLogger(CustomLogger):
optional_params=kwargs.get("optional_params", None),
)
response_dict = {}
if response_obj:
response_dict = convert_litellm_response_object_to_dict(
response_obj=response_obj
)
exception_str = None
# Handle logging exception attributes
if "exception" in kwargs:
exception_str = kwargs.get("exception", "")
if not isinstance(exception_str, str):
exception_str = str(exception_str)
_spend_log_payload: SpendLogsPayload = get_logging_payload(
kwargs=kwargs,
response_obj=response_obj,
@ -156,8 +218,10 @@ class GCSBucketLogger(CustomLogger):
response_obj=response_dict,
start_time=start_time,
end_time=end_time,
spend_log_metadata=_spend_log_payload["metadata"],
spend_log_metadata=_spend_log_payload.get("metadata", ""),
response_cost=kwargs.get("response_cost", None),
exception=exception_str,
log_event_type=None,
)
return gcs_payload


@ -141,42 +141,42 @@ class PrometheusLogger(CustomLogger):
]
# Metric for deployment state
self.deployment_state = Gauge(
"deployment_state",
self.litellm_deployment_state = Gauge(
"litellm_deployment_state",
"LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
labelnames=_logged_llm_labels,
)
self.llm_deployment_success_responses = Counter(
name="llm_deployment_success_responses",
self.litellm_deployment_success_responses = Counter(
name="litellm_deployment_success_responses",
documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm",
labelnames=_logged_llm_labels,
)
self.llm_deployment_failure_responses = Counter(
name="llm_deployment_failure_responses",
self.litellm_deployment_failure_responses = Counter(
name="litellm_deployment_failure_responses",
documentation="LLM Deployment Analytics - Total number of failed LLM API calls via litellm",
labelnames=_logged_llm_labels,
)
self.llm_deployment_total_requests = Counter(
name="llm_deployment_total_requests",
self.litellm_deployment_total_requests = Counter(
name="litellm_deployment_total_requests",
documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
labelnames=_logged_llm_labels,
)
# Deployment Latency tracking
self.llm_deployment_latency_per_output_token = Histogram(
name="llm_deployment_latency_per_output_token",
self.litellm_deployment_latency_per_output_token = Histogram(
name="litellm_deployment_latency_per_output_token",
documentation="LLM Deployment Analytics - Latency per output token",
labelnames=_logged_llm_labels,
)
self.llm_deployment_successful_fallbacks = Counter(
"llm_deployment_successful_fallbacks",
self.litellm_deployment_successful_fallbacks = Counter(
"litellm_deployment_successful_fallbacks",
"LLM Deployment Analytics - Number of successful fallback requests from primary model -> fallback model",
["primary_model", "fallback_model"],
)
self.llm_deployment_failed_fallbacks = Counter(
"llm_deployment_failed_fallbacks",
self.litellm_deployment_failed_fallbacks = Counter(
"litellm_deployment_failed_fallbacks",
"LLM Deployment Analytics - Number of failed fallback requests from primary model -> fallback model",
["primary_model", "fallback_model"],
)
@ -358,14 +358,14 @@ class PrometheusLogger(CustomLogger):
api_provider=llm_provider,
)
self.llm_deployment_failure_responses.labels(
self.litellm_deployment_failure_responses.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
api_provider=llm_provider,
).inc()
self.llm_deployment_total_requests.labels(
self.litellm_deployment_total_requests.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
@ -438,14 +438,14 @@ class PrometheusLogger(CustomLogger):
api_provider=llm_provider,
)
self.llm_deployment_success_responses.labels(
self.litellm_deployment_success_responses.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
api_provider=llm_provider,
).inc()
self.llm_deployment_total_requests.labels(
self.litellm_deployment_total_requests.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
@ -475,7 +475,7 @@ class PrometheusLogger(CustomLogger):
latency_per_token = None
if output_tokens is not None and output_tokens > 0:
latency_per_token = _latency_seconds / output_tokens
self.llm_deployment_latency_per_output_token.labels(
self.litellm_deployment_latency_per_output_token.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
@ -497,7 +497,7 @@ class PrometheusLogger(CustomLogger):
kwargs,
)
_new_model = kwargs.get("model")
self.llm_deployment_successful_fallbacks.labels(
self.litellm_deployment_successful_fallbacks.labels(
primary_model=original_model_group, fallback_model=_new_model
).inc()
@ -508,11 +508,11 @@ class PrometheusLogger(CustomLogger):
kwargs,
)
_new_model = kwargs.get("model")
self.llm_deployment_failed_fallbacks.labels(
self.litellm_deployment_failed_fallbacks.labels(
primary_model=original_model_group, fallback_model=_new_model
).inc()
def set_deployment_state(
def set_litellm_deployment_state(
self,
state: int,
litellm_model_name: str,
@ -520,7 +520,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.deployment_state.labels(
self.litellm_deployment_state.labels(
litellm_model_name, model_id, api_base, api_provider
).set(state)
@ -531,7 +531,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.set_deployment_state(
self.set_litellm_deployment_state(
0, litellm_model_name, model_id, api_base, api_provider
)
@ -542,7 +542,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.set_deployment_state(
self.set_litellm_deployment_state(
1, litellm_model_name, model_id, api_base, api_provider
)
@ -553,7 +553,7 @@ class PrometheusLogger(CustomLogger):
api_base: str,
api_provider: str,
):
self.set_deployment_state(
self.set_litellm_deployment_state(
2, litellm_model_name, model_id, api_base, api_provider
)


@ -41,8 +41,8 @@ async def get_fallback_metric_from_prometheus():
"""
response_message = ""
relevant_metrics = [
"llm_deployment_successful_fallbacks_total",
"llm_deployment_failed_fallbacks_total",
"litellm_deployment_successful_fallbacks_total",
"litellm_deployment_failed_fallbacks_total",
]
for metric in relevant_metrics:
response_json = await get_metric_from_prometheus(


@ -35,6 +35,7 @@ from litellm.types.llms.anthropic import (
AnthropicResponseContentBlockText,
AnthropicResponseContentBlockToolUse,
AnthropicResponseUsageBlock,
AnthropicSystemMessageContent,
ContentBlockDelta,
ContentBlockStart,
ContentBlockStop,
@ -759,6 +760,7 @@ class AnthropicChatCompletion(BaseLLM):
## CALCULATING USAGE
prompt_tokens = completion_response["usage"]["input_tokens"]
completion_tokens = completion_response["usage"]["output_tokens"]
_usage = completion_response["usage"]
total_tokens = prompt_tokens + completion_tokens
model_response.created = int(time.time())
@ -768,6 +770,11 @@ class AnthropicChatCompletion(BaseLLM):
completion_tokens=completion_tokens,
total_tokens=total_tokens,
)
if "cache_creation_input_tokens" in _usage:
usage["cache_creation_input_tokens"] = _usage["cache_creation_input_tokens"]
if "cache_read_input_tokens" in _usage:
usage["cache_read_input_tokens"] = _usage["cache_read_input_tokens"]
setattr(model_response, "usage", usage) # type: ignore
return model_response
@ -901,6 +908,7 @@ class AnthropicChatCompletion(BaseLLM):
# Separate system prompt from rest of message
system_prompt_indices = []
system_prompt = ""
anthropic_system_message_list = None
for idx, message in enumerate(messages):
if message["role"] == "system":
valid_content: bool = False
@ -908,8 +916,23 @@ class AnthropicChatCompletion(BaseLLM):
system_prompt += message["content"]
valid_content = True
elif isinstance(message["content"], list):
for content in message["content"]:
system_prompt += content.get("text", "")
for _content in message["content"]:
anthropic_system_message_content = (
AnthropicSystemMessageContent(
type=_content.get("type"),
text=_content.get("text"),
)
)
if "cache_control" in _content:
anthropic_system_message_content["cache_control"] = (
_content["cache_control"]
)
if anthropic_system_message_list is None:
anthropic_system_message_list = []
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
if valid_content:
@ -919,6 +942,10 @@ class AnthropicChatCompletion(BaseLLM):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
# Handling anthropic API Prompt Caching
if anthropic_system_message_list is not None:
optional_params["system"] = anthropic_system_message_list
# Format rest of message according to anthropic guidelines
try:
messages = prompt_factory(
@ -954,6 +981,8 @@ class AnthropicChatCompletion(BaseLLM):
else: # assume openai tool call
new_tool = tool["function"]
new_tool["input_schema"] = new_tool.pop("parameters") # rename key
if "cache_control" in tool:
new_tool["cache_control"] = tool["cache_control"]
anthropic_tools.append(new_tool)
optional_params["tools"] = anthropic_tools


@ -356,6 +356,7 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
"json": data,
"method": "POST",
"timeout": litellm.request_timeout,
"follow_redirects": True
}
if api_key is not None:
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}


@ -1224,6 +1224,19 @@ def convert_to_anthropic_tool_invoke(
return anthropic_tool_invoke
def add_cache_control_to_content(
anthropic_content_element: Union[
dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
],
orignal_content_element: dict,
):
if "cache_control" in orignal_content_element:
anthropic_content_element["cache_control"] = orignal_content_element[
"cache_control"
]
return anthropic_content_element
def anthropic_messages_pt(
messages: list,
model: str,
@ -1264,8 +1277,8 @@ def anthropic_messages_pt(
image_chunk = convert_to_anthropic_image_obj(
m["image_url"]["url"]
)
user_content.append(
AnthropicMessagesImageParam(
_anthropic_content_element = AnthropicMessagesImageParam(
type="image",
source=AnthropicImageParamSource(
type="base64",
@ -1273,9 +1286,22 @@ def anthropic_messages_pt(
data=image_chunk["data"],
),
)
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_content_element,
orignal_content_element=m,
)
user_content.append(anthropic_content_element)
elif m.get("type", "") == "text":
user_content.append({"type": "text", "text": m["text"]})
_anthropic_text_content_element = {
"type": "text",
"text": m["text"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_text_content_element,
orignal_content_element=m,
)
user_content.append(anthropic_content_element)
elif (
messages[msg_i]["role"] == "tool"
or messages[msg_i]["role"] == "function"
@ -1306,6 +1332,10 @@ def anthropic_messages_pt(
anthropic_message = AnthropicMessagesTextParam(
type="text", text=m.get("text")
)
anthropic_message = add_cache_control_to_content(
anthropic_content_element=anthropic_message,
orignal_content_element=m,
)
assistant_content.append(anthropic_message)
elif (
"content" in messages[msg_i]
@ -1313,9 +1343,17 @@ def anthropic_messages_pt(
and len(messages[msg_i]["content"])
> 0 # don't pass empty text blocks. anthropic api raises errors.
):
assistant_content.append(
{"type": "text", "text": messages[msg_i]["content"]}
_anthropic_text_content_element = {
"type": "text",
"text": messages[msg_i]["content"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_text_content_element,
orignal_content_element=messages[msg_i],
)
assistant_content.append(anthropic_content_element)
if messages[msg_i].get(
"tool_calls", []
@ -1701,12 +1739,14 @@ def cohere_messages_pt_v2(
assistant_tool_calls: List[ToolCallObject] = []
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
assistant_text = (
messages[msg_i].get("content") or ""
) # either string or none
if assistant_text:
assistant_content += assistant_text
if isinstance(messages[msg_i]["content"], list):
for m in messages[msg_i]["content"]:
if m.get("type", "") == "text":
assistant_content += m["text"]
elif messages[msg_i].get("content") is not None and isinstance(
messages[msg_i]["content"], str
):
assistant_content += messages[msg_i]["content"]
if messages[msg_i].get(
"tool_calls", []
): # support assistant tool invoke conversion


@ -2074,7 +2074,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"max_tokens": 4096,
@ -2085,7 +2086,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
@ -2096,7 +2098,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-opus@20240229": {
"max_tokens": 4096,
@ -2107,7 +2110,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
@ -4531,6 +4535,69 @@
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-70b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-8b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-huge-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000005,
"output_cost_per_token": 0.000005,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/pplx-7b-chat": {
"max_tokens": 8192,
"max_input_tokens": 8192,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1,7 +1,6 @@
model_list:
- model_name: azure-embedding-model
- model_name: "gpt-4"
litellm_params:
model: azure/azure-embedding-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model: "gpt-4"
model_info:
my_custom_key: "my_custom_value"


@ -85,6 +85,8 @@ def _get_bearer_token(
):
if api_key.startswith("Bearer "): # ensure Bearer token passed in
api_key = api_key.replace("Bearer ", "") # extract the token
elif api_key.startswith("Basic "):
api_key = api_key.replace("Basic ", "") # handle langfuse input
else:
api_key = ""
return api_key
@ -138,7 +140,6 @@ async def user_api_key_auth(
pass_through_endpoints: Optional[List[dict]] = general_settings.get(
"pass_through_endpoints", None
)
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
@ -367,6 +368,40 @@ async def user_api_key_auth(
parent_otel_span=parent_otel_span,
)
#### ELSE ####
## CHECK PASS-THROUGH ENDPOINTS ##
if pass_through_endpoints is not None:
for endpoint in pass_through_endpoints:
if endpoint.get("path", "") == route:
## IF AUTH DISABLED
if endpoint.get("auth") is not True:
return UserAPIKeyAuth()
## IF AUTH ENABLED
### IF CUSTOM PARSER REQUIRED
if (
endpoint.get("custom_auth_parser") is not None
and endpoint.get("custom_auth_parser") == "langfuse"
):
"""
- langfuse returns {'Authorization': 'Basic YW55dGhpbmc6YW55dGhpbmc'}
- check the langfuse public key if it contains the litellm api key
"""
import base64
api_key = api_key.replace("Basic ", "").strip()
decoded_bytes = base64.b64decode(api_key)
decoded_str = decoded_bytes.decode("utf-8")
api_key = decoded_str.split(":")[0]
else:
headers = endpoint.get("headers", None)
if headers is not None:
header_key = headers.get("litellm_user_api_key", "")
if (
isinstance(request.headers, dict)
and request.headers.get(key=header_key) is not None
):
api_key = request.headers.get(key=header_key)
if master_key is None:
if isinstance(api_key, str):
return UserAPIKeyAuth(
@ -533,7 +568,11 @@ async def user_api_key_auth(
if isinstance(
api_key, str
): # if generated token, make sure it starts with sk-.
assert api_key.startswith("sk-") # prevent token hashes from being used
assert api_key.startswith(
"sk-"
), "LiteLLM Virtual Key expected. Received={}, expected to start with 'sk-'.".format(
api_key
) # prevent token hashes from being used
else:
verbose_logger.warning(
"litellm.proxy.proxy_server.user_api_key_auth(): Warning - Key={} is not a string.".format(


@ -5,7 +5,12 @@ from fastapi import Request
import litellm
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth
from litellm.proxy._types import (
AddTeamCallback,
CommonProxyErrors,
TeamCallbackMetadata,
UserAPIKeyAuth,
)
from litellm.types.utils import SupportedCacheControls
if TYPE_CHECKING:
@ -59,6 +64,42 @@ def safe_add_api_version_from_query_params(data: dict, request: Request):
verbose_logger.error("error checking api version in query params: %s", str(e))
def convert_key_logging_metadata_to_callback(
data: AddTeamCallback, team_callback_settings_obj: Optional[TeamCallbackMetadata]
) -> TeamCallbackMetadata:
if team_callback_settings_obj is None:
team_callback_settings_obj = TeamCallbackMetadata()
if data.callback_type == "success":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if data.callback_name not in team_callback_settings_obj.success_callback:
team_callback_settings_obj.success_callback.append(data.callback_name)
elif data.callback_type == "failure":
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name not in team_callback_settings_obj.failure_callback:
team_callback_settings_obj.failure_callback.append(data.callback_name)
elif data.callback_type == "success_and_failure":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name not in team_callback_settings_obj.success_callback:
team_callback_settings_obj.success_callback.append(data.callback_name)
if data.callback_name not in team_callback_settings_obj.failure_callback:
team_callback_settings_obj.failure_callback.append(data.callback_name)
for var, value in data.callback_vars.items():
if team_callback_settings_obj.callback_vars is None:
team_callback_settings_obj.callback_vars = {}
team_callback_settings_obj.callback_vars[var] = litellm.get_secret(value)
return team_callback_settings_obj
async def add_litellm_data_to_request(
data: dict,
request: Request,
@ -224,6 +265,7 @@ async def add_litellm_data_to_request(
} # add the team-specific configs to the completion call
# Team Callbacks controls
callback_settings_obj: Optional[TeamCallbackMetadata] = None
if user_api_key_dict.team_metadata is not None:
team_metadata = user_api_key_dict.team_metadata
if "callback_settings" in team_metadata:
@ -241,6 +283,18 @@ async def add_litellm_data_to_request(
}
}
"""
elif (
user_api_key_dict.metadata is not None
and "logging" in user_api_key_dict.metadata
):
for item in user_api_key_dict.metadata["logging"]:
callback_settings_obj = convert_key_logging_metadata_to_callback(
data=AddTeamCallback(**item),
team_callback_settings_obj=callback_settings_obj,
)
if callback_settings_obj is not None:
data["success_callback"] = callback_settings_obj.success_callback
data["failure_callback"] = callback_settings_obj.failure_callback


@ -309,7 +309,7 @@ async def pass_through_request(
json=_parsed_body,
)
if response.status_code != 200:
if response.status_code >= 300:
raise HTTPException(status_code=response.status_code, detail=response.text)
content = await response.aread()


@ -39,7 +39,4 @@ general_settings:
litellm_settings:
fallbacks: [{"gemini-1.5-pro-001": ["gpt-4o"]}]
success_callback: ["langfuse", "prometheus"]
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"]
failure_callback: ["prometheus"]
cache: True
callbacks: ["gcs_bucket"]


@ -21,6 +21,8 @@ def get_logging_payload(
if kwargs is None:
kwargs = {}
if response_obj is None:
response_obj = {}
# standardize this function to be used across, s3, dynamoDB, langfuse logging
litellm_params = kwargs.get("litellm_params", {})
metadata = (


@ -190,7 +190,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict):
if azure_ad_token.startswith("oidc/"):
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
if api_version is None:
api_version = litellm.AZURE_DEFAULT_API_VERSION
api_version = os.getenv("AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION)
if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"):


@ -0,0 +1,321 @@
import json
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
import io
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import os
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import litellm
from litellm import RateLimitError, Timeout, completion, completion_cost, embedding
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
# litellm.num_retries =3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
messages = [{"content": user_message, "role": "user"}]
def logger_fn(user_model_dict):
print(f"user_model_dict: {user_model_dict}")
@pytest.fixture(autouse=True)
def reset_callbacks():
print("\npytest fixture - resetting callbacks")
litellm.success_callback = []
litellm._async_success_callback = []
litellm.failure_callback = []
litellm.callbacks = []
@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_tools():
# Arrange: Set up the MagicMock for the httpx.AsyncClient
mock_response = AsyncMock()
def return_val():
return {
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
"type": "message",
"role": "assistant",
"content": [{"type": "text", "text": "Hello!"}],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": "end_turn",
"stop_sequence": None,
"usage": {"input_tokens": 12, "output_tokens": 6},
}
mock_response.json = return_val
litellm.set_verbose = True
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
# Act: Call the litellm.acompletion function
response = await litellm.acompletion(
api_key="mock_api_key",
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{"role": "user", "content": "What's the weather like in Boston today?"}
],
tools=[
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"},
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
# Print what was called on the mock
print("call args=", mock_post.call_args)
expected_url = "https://api.anthropic.com/v1/messages"
expected_headers = {
"accept": "application/json",
"content-type": "application/json",
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
"x-api-key": "mock_api_key",
}
expected_json = {
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's the weather like in Boston today?",
}
],
}
],
"tools": [
{
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"cache_control": {"type": "ephemeral"},
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
}
],
"max_tokens": 4096,
"model": "claude-3-5-sonnet-20240620",
}
mock_post.assert_called_once_with(
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
)
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_basic():
litellm.set_verbose = True
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
temperature=0.2,
max_tokens=10,
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
print("response=", response)
assert "cache_read_input_tokens" in response.usage
assert "cache_creation_input_tokens" in response.usage
# Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
assert (response.usage.cache_read_input_tokens > 0) or (
response.usage.cache_creation_input_tokens > 0
)
@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_system():
# https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
# Large Context Caching Example
mock_response = AsyncMock()
def return_val():
return {
"id": "msg_01XFDUDYJgAACzvnptvVoYEL",
"type": "message",
"role": "assistant",
"content": [{"type": "text", "text": "Hello!"}],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": "end_turn",
"stop_sequence": None,
"usage": {"input_tokens": 12, "output_tokens": 6},
}
mock_response.json = return_val
litellm.set_verbose = True
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
# Act: Call the litellm.acompletion function
response = await litellm.acompletion(
api_key="mock_api_key",
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
# Print what was called on the mock
print("call args=", mock_post.call_args)
expected_url = "https://api.anthropic.com/v1/messages"
expected_headers = {
"accept": "application/json",
"content-type": "application/json",
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
"x-api-key": "mock_api_key",
}
expected_json = {
"system": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "what are the key terms and conditions in this agreement?",
}
],
}
],
"max_tokens": 4096,
"model": "claude-3-5-sonnet-20240620",
}
mock_post.assert_called_once_with(
expected_url, json=expected_json, headers=expected_headers, timeout=600.0
)


@ -14,7 +14,7 @@ sys.path.insert(
) # Adds the parent directory to the system path
import os
from unittest.mock import MagicMock, patch
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@ -3474,7 +3474,6 @@ def response_format_tests(response: litellm.ModelResponse):
assert isinstance(response.usage.total_tokens, int) # type: ignore
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize(
"model",
[
@ -3488,6 +3487,7 @@ def response_format_tests(response: litellm.ModelResponse):
"cohere.command-text-v14",
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_bedrock_httpx_models(sync_mode, model):
litellm.set_verbose = True
@ -3730,19 +3730,21 @@ def test_completion_anyscale_api():
# test_completion_anyscale_api()
@pytest.mark.skip(reason="flaky test, times out frequently")
# @pytest.mark.skip(reason="flaky test, times out frequently")
def test_completion_cohere():
try:
# litellm.set_verbose=True
messages = [
{"role": "system", "content": "You're a good bot"},
{"role": "assistant", "content": [{"text": "2", "type": "text"}]},
{"role": "assistant", "content": [{"text": "3", "type": "text"}]},
{
"role": "user",
"content": "Hey",
},
]
response = completion(
model="command-nightly",
model="command-r",
messages=messages,
)
print(response)


@ -1,23 +1,27 @@
# What is this?
## Test to make sure function call response always works with json.loads() -> no extra parsing required. Relevant issue - https://github.com/BerriAI/litellm/issues/2654
import sys, os
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
import os, io
import io
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
import json
import warnings
from litellm import completion
from typing import List
import pytest
import litellm
from litellm import completion
# Just a stub to keep the sample code simple
class Trade:
@ -78,6 +82,7 @@ def trade(model_name: str) -> List[Trade]:
},
}
try:
response = completion(
model_name,
[
@ -129,7 +134,8 @@ def trade(model_name: str) -> List[Trade]:
"function": {"name": tool_spec["function"]["name"]}, # type: ignore
},
)
except litellm.InternalServerError:
pass
calls = response.choices[0].message.tool_calls
trades = [trade for call in calls for trade in parse_call(call)]
return trades


@ -147,6 +147,117 @@ async def test_basic_gcs_logger():
assert gcs_payload["response_cost"] > 0.0
assert gcs_payload["log_event_type"] == "successful_api_call"
gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"])
assert (
gcs_payload["spend_log_metadata"]["user_api_key"]
== "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b"
)
assert (
gcs_payload["spend_log_metadata"]["user_api_key_user_id"]
== "116544810872468347480"
)
# Delete Object from GCS
print("deleting object from GCS")
await gcs_logger.delete_gcs_object(object_name=object_name)
@pytest.mark.asyncio
async def test_basic_gcs_logger_failure():
load_vertex_ai_credentials()
gcs_logger = GCSBucketLogger()
print("GCSBucketLogger", gcs_logger)
gcs_log_id = f"failure-test-{uuid.uuid4().hex}"
litellm.callbacks = [gcs_logger]
try:
response = await litellm.acompletion(
model="gpt-3.5-turbo",
temperature=0.7,
messages=[{"role": "user", "content": "This is a test"}],
max_tokens=10,
user="ishaan-2",
mock_response=litellm.BadRequestError(
model="gpt-3.5-turbo",
message="Error: 400: Bad Request: Invalid API key, please check your API key and try again.",
llm_provider="openai",
),
metadata={
"gcs_log_id": gcs_log_id,
"tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"],
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
"user_api_key_alias": None,
"user_api_end_user_max_budget": None,
"litellm_api_version": "0.0.0",
"global_max_parallel_requests": None,
"user_api_key_user_id": "116544810872468347480",
"user_api_key_org_id": None,
"user_api_key_team_id": None,
"user_api_key_team_alias": None,
"user_api_key_metadata": {},
"requester_ip_address": "127.0.0.1",
"spend_logs_metadata": {"hello": "world"},
"headers": {
"content-type": "application/json",
"user-agent": "PostmanRuntime/7.32.3",
"accept": "*/*",
"postman-token": "92300061-eeaa-423b-a420-0b44896ecdc4",
"host": "localhost:4000",
"accept-encoding": "gzip, deflate, br",
"connection": "keep-alive",
"content-length": "163",
},
"endpoint": "http://localhost:4000/chat/completions",
"model_group": "gpt-3.5-turbo",
"deployment": "azure/chatgpt-v-2",
"model_info": {
"id": "4bad40a1eb6bebd1682800f16f44b9f06c52a6703444c99c7f9f32e9de3693b4",
"db_model": False,
},
"api_base": "https://openai-gpt-4-test-v-1.openai.azure.com/",
"caching_groups": None,
"raw_request": "\n\nPOST Request Sent from LiteLLM:\ncurl -X POST \\\nhttps://openai-gpt-4-test-v-1.openai.azure.com//openai/ \\\n-H 'Authorization: *****' \\\n-d '{'model': 'chatgpt-v-2', 'messages': [{'role': 'system', 'content': 'you are a helpful assistant.\\n'}, {'role': 'user', 'content': 'bom dia'}], 'stream': False, 'max_tokens': 10, 'user': '116544810872468347480', 'extra_body': {}}'\n",
},
)
except:
pass
await asyncio.sleep(5)
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")
# Modify the object_name to include the date-based folder
object_name = gcs_log_id
print("object_name", object_name)
# Check if object landed on GCS
object_from_gcs = await gcs_logger.download_gcs_object(object_name=object_name)
print("object from gcs=", object_from_gcs)
# convert object_from_gcs from bytes to DICT
parsed_data = json.loads(object_from_gcs)
print("object_from_gcs as dict", parsed_data)
print("type of object_from_gcs", type(parsed_data))
gcs_payload = GCSBucketPayload(**parsed_data)
print("gcs_payload", gcs_payload)
assert gcs_payload["request_kwargs"]["model"] == "gpt-3.5-turbo"
assert gcs_payload["request_kwargs"]["messages"] == [
{"role": "user", "content": "This is a test"}
]
assert gcs_payload["response_cost"] == 0
assert gcs_payload["log_event_type"] == "failed_api_call"
gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"])
assert (


@ -1,5 +1,6 @@
import os
import sys
from typing import Optional
import pytest
from fastapi import FastAPI
@ -30,6 +31,7 @@ def client():
async def test_pass_through_endpoint(client, monkeypatch):
# Mock the httpx.AsyncClient.request method
monkeypatch.setattr("httpx.AsyncClient.request", mock_request)
import litellm
# Define a pass-through endpoint
pass_through_endpoints = [
@ -42,6 +44,11 @@ async def test_pass_through_endpoint(client, monkeypatch):
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
# Make a request to the pass-through endpoint
response = client.post("/test-endpoint", json={"prompt": "Hello, world!"})
@ -54,6 +61,7 @@ async def test_pass_through_endpoint(client, monkeypatch):
@pytest.mark.asyncio
async def test_pass_through_endpoint_rerank(client):
_cohere_api_key = os.environ.get("COHERE_API_KEY")
import litellm
# Define a pass-through endpoint
pass_through_endpoints = [
@ -66,6 +74,11 @@ async def test_pass_through_endpoint_rerank(client):
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"model": "rerank-english-v3.0",
@ -87,7 +100,7 @@ async def test_pass_through_endpoint_rerank(client):
@pytest.mark.parametrize(
"auth, rpm_limit, expected_error_code",
[(True, 0, 429), (True, 1, 200), (False, 0, 401)],
[(True, 0, 429), (True, 1, 200), (False, 0, 200)],
)
@pytest.mark.asyncio
async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_limit):
@ -123,6 +136,11 @@ async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_li
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"model": "rerank-english-v3.0",
@ -146,6 +164,123 @@ async def test_pass_through_endpoint_rpm_limit(auth, expected_error_code, rpm_li
assert response.status_code == expected_error_code
@pytest.mark.parametrize(
"auth, rpm_limit, expected_error_code",
[(True, 0, 429), (True, 1, 207), (False, 0, 207)],
)
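# Langfuse's public ingestion endpoint returns 207 (Multi-Status) for accepted batches,
# so the "allowed" cases here expect 207 rather than 200.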
@pytest.mark.asyncio
async def test_aaapass_through_endpoint_pass_through_keys_langfuse(
auth, expected_error_code, rpm_limit
):
client = TestClient(app)
import litellm
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.proxy_server import ProxyLogging, hash_token, user_api_key_cache
# Store original values
original_user_api_key_cache = getattr(
litellm.proxy.proxy_server, "user_api_key_cache", None
)
original_master_key = getattr(litellm.proxy.proxy_server, "master_key", None)
original_prisma_client = getattr(litellm.proxy.proxy_server, "prisma_client", None)
original_proxy_logging_obj = getattr(
litellm.proxy.proxy_server, "proxy_logging_obj", None
)
try:
mock_api_key = "sk-my-test-key"
cache_value = UserAPIKeyAuth(
token=hash_token(mock_api_key), rpm_limit=rpm_limit
)
_cohere_api_key = os.environ.get("COHERE_API_KEY")
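# Simulate a valid virtual key: cache the hashed token with the parametrized rpm_limit,
# so auth and rate limiting run against this in-memory entry rather than a real database.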
user_api_key_cache.set_cache(key=hash_token(mock_api_key), value=cache_value)
proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
proxy_logging_obj._init_litellm_callbacks()
setattr(litellm.proxy.proxy_server, "user_api_key_cache", user_api_key_cache)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "prisma_client", "FAKE-VAR")
setattr(litellm.proxy.proxy_server, "proxy_logging_obj", proxy_logging_obj)
# Define a pass-through endpoint
pass_through_endpoints = [
{
"path": "/api/public/ingestion",
"target": "https://cloud.langfuse.com/api/public/ingestion",
"auth": auth,
"custom_auth_parser": "langfuse",
"headers": {
"LANGFUSE_PUBLIC_KEY": "os.environ/LANGFUSE_PUBLIC_KEY",
"LANGFUSE_SECRET_KEY": "os.environ/LANGFUSE_SECRET_KEY",
},
}
]
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
old_general_settings = general_settings
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"batch": [
{
"id": "80e2141f-0ca6-47b7-9c06-dde5e97de690",
"type": "trace-create",
"body": {
"id": "0687af7b-4a75-4de8-a4f6-cba1cdc00865",
"timestamp": "2024-08-14T02:38:56.092950Z",
"name": "test-trace-litellm-proxy-passthrough",
},
"timestamp": "2024-08-14T02:38:56.093352Z",
}
],
"metadata": {
"batch_size": 1,
"sdk_integration": "default",
"sdk_name": "python",
"sdk_version": "2.27.0",
"public_key": "anything",
},
}
# Make a request to the pass-through endpoint
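# The Authorization header below is HTTP Basic auth encoding "sk-my-test-key:anything",
# so the langfuse custom_auth_parser should resolve it to the mocked virtual key above.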
response = client.post(
"/api/public/ingestion",
json=_json_data,
headers={"Authorization": "Basic c2stbXktdGVzdC1rZXk6YW55dGhpbmc="},
)
print("JSON response: ", _json_data)
print("RESPONSE RECEIVED - {}".format(response.text))
# Assert the response
assert response.status_code == expected_error_code
setattr(litellm.proxy.proxy_server, "general_settings", old_general_settings)
finally:
# Reset to original values
setattr(
litellm.proxy.proxy_server,
"user_api_key_cache",
original_user_api_key_cache,
)
setattr(litellm.proxy.proxy_server, "master_key", original_master_key)
setattr(litellm.proxy.proxy_server, "prisma_client", original_prisma_client)
setattr(
litellm.proxy.proxy_server, "proxy_logging_obj", original_proxy_logging_obj
)
@pytest.mark.asyncio
async def test_pass_through_endpoint_anthropic(client):
import litellm
@ -178,6 +313,11 @@ async def test_pass_through_endpoint_anthropic(client):
# Initialize the pass-through endpoint
await initialize_pass_through_endpoints(pass_through_endpoints)
general_settings: Optional[dict] = (
getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
)
general_settings.update({"pass_through_endpoints": pass_through_endpoints})
setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
_json_data = {
"model": "gpt-3.5-turbo",

View file

@ -76,6 +76,6 @@ async def test_async_prometheus_success_logging():
print("metrics from prometheus", metrics)
assert metrics["litellm_requests_metric_total"] == 1.0
assert metrics["litellm_total_tokens_total"] == 30.0
assert metrics["llm_deployment_success_responses_total"] == 1.0
assert metrics["llm_deployment_total_requests_total"] == 1.0
assert metrics["llm_deployment_latency_per_output_token_bucket"] == 1.0
assert metrics["litellm_deployment_success_responses_total"] == 1.0
assert metrics["litellm_deployment_total_requests_total"] == 1.0
assert metrics["litellm_deployment_latency_per_output_token_bucket"] == 1.0

View file

@ -260,3 +260,56 @@ def test_anthropic_messages_tool_call():
translated_messages[-1]["content"][0]["tool_use_id"]
== "bc8cb4b6-88c4-4138-8993-3a9d9cd51656"
)
def test_anthropic_cache_controls_pt():
"see anthropic docs for this: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#continuing-a-multi-turn-conversation"
messages = [
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache_control so the conversation can continue from this cache in follow-ups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
"cache_control": {"type": "ephemeral"},
},
]
translated_messages = anthropic_messages_pt(
messages, model="claude-3-5-sonnet-20240620", llm_provider="anthropic"
)
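# Only the turns explicitly marked above should carry cache_control after translation.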
for i, msg in enumerate(translated_messages):
if i == 0:
assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
elif i == 1:
assert "cache_controls" not in msg["content"][0]
elif i == 2:
assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
elif i == 3:
assert msg["content"][0]["cache_control"] == {"type": "ephemeral"}
print("translated_messages: ", translated_messages)

View file

@ -966,3 +966,203 @@ async def test_user_info_team_list(prisma_client):
pass
mock_client.assert_called()
@pytest.mark.skip(reason="Local test")
@pytest.mark.asyncio
async def test_add_callback_via_key(prisma_client):
"""
Test that the callback specified on the key is used.
"""
global headers
import json
from fastapi import HTTPException, Request, Response
from starlette.datastructures import URL
from litellm.proxy.proxy_server import chat_completion
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
litellm.set_verbose = True
try:
# Your test data
test_data = {
"model": "azure/chatgpt-v-2",
"messages": [
{"role": "user", "content": "write 1 sentence poem"},
],
"max_tokens": 10,
"mock_response": "Hello world",
"api_key": "my-fake-key",
}
request = Request(scope={"type": "http", "method": "POST", "headers": {}})
request._url = URL(url="/chat/completions")
json_bytes = json.dumps(test_data).encode("utf-8")
request._body = json_bytes
with patch.object(
litellm.litellm_core_utils.litellm_logging,
"LangFuseLogger",
new=MagicMock(),
) as mock_client:
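# The proxy should pick up the key's "logging" metadata and initialize a LangFuseLogger;
# patching it lets us assert the callback fires without real Langfuse credentials.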
resp = await chat_completion(
request=request,
fastapi_response=Response(),
user_api_key_dict=UserAPIKeyAuth(
metadata={
"logging": [
{
"callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
"callback_type": "success", # set, if required by integration - future improvement, have logging tools work for success + failure by default
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
"langfuse_host": "https://us.cloud.langfuse.com",
},
}
]
}
),
)
print(resp)
mock_client.assert_called()
mock_client.return_value.log_event.assert_called()
args, kwargs = mock_client.return_value.log_event.call_args
kwargs = kwargs["kwargs"]
assert "user_api_key_metadata" in kwargs["litellm_params"]["metadata"]
assert (
"logging"
in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"]
)
checked_keys = False
for item in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"][
"logging"
]:
for k, v in item["callback_vars"].items():
print("k={}, v={}".format(k, v))
if "key" in k:
assert "os.environ" in v
checked_keys = True
assert checked_keys
except Exception as e:
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
@pytest.mark.asyncio
async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
import json
from fastapi import HTTPException, Request, Response
from starlette.datastructures import URL
from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
proxy_config = getattr(litellm.proxy.proxy_server, "proxy_config")
request = Request(scope={"type": "http", "method": "POST", "headers": {}})
request._url = URL(url="/chat/completions")
test_data = {
"model": "azure/chatgpt-v-2",
"messages": [
{"role": "user", "content": "write 1 sentence poem"},
],
"max_tokens": 10,
"mock_response": "Hello world",
"api_key": "my-fake-key",
}
json_bytes = json.dumps(test_data).encode("utf-8")
request._body = json_bytes
data = {
"data": {
"model": "azure/chatgpt-v-2",
"messages": [{"role": "user", "content": "write 1 sentence poem"}],
"max_tokens": 10,
"mock_response": "Hello world",
"api_key": "my-fake-key",
},
"request": request,
"user_api_key_dict": UserAPIKeyAuth(
token=None,
key_name=None,
key_alias=None,
spend=0.0,
max_budget=None,
expires=None,
models=[],
aliases={},
config={},
user_id=None,
team_id=None,
max_parallel_requests=None,
metadata={
"logging": [
{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
"langfuse_host": "https://us.cloud.langfuse.com",
},
}
]
},
tpm_limit=None,
rpm_limit=None,
budget_duration=None,
budget_reset_at=None,
allowed_cache_controls=[],
permissions={},
model_spend={},
model_max_budget={},
soft_budget_cooldown=False,
litellm_budget_table=None,
org_id=None,
team_spend=None,
team_alias=None,
team_tpm_limit=None,
team_rpm_limit=None,
team_max_budget=None,
team_models=[],
team_blocked=False,
soft_budget=None,
team_model_aliases=None,
team_member_spend=None,
team_metadata=None,
end_user_id=None,
end_user_tpm_limit=None,
end_user_rpm_limit=None,
end_user_max_budget=None,
last_refreshed_at=None,
api_key=None,
user_role=None,
allowed_model_region=None,
parent_otel_span=None,
),
"proxy_config": proxy_config,
"general_settings": {},
"version": "0.0.0",
}
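# add_litellm_data_to_request should translate the key's "logging" metadata into
# request-level callback settings (success_callback + langfuse_* vars), asserted below.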
new_data = await add_litellm_data_to_request(**data)
assert "success_callback" in new_data
assert new_data["success_callback"] == ["langfuse"]
assert "langfuse_public_key" in new_data
assert "langfuse_secret_key" in new_data

View file

@ -15,9 +15,10 @@ class AnthropicMessagesTool(TypedDict, total=False):
input_schema: Required[dict]
class AnthropicMessagesTextParam(TypedDict):
class AnthropicMessagesTextParam(TypedDict, total=False):
type: Literal["text"]
text: str
cache_control: Optional[dict]
class AnthropicMessagesToolUseParam(TypedDict):
@ -54,9 +55,10 @@ class AnthropicImageParamSource(TypedDict):
data: str
class AnthropicMessagesImageParam(TypedDict):
class AnthropicMessagesImageParam(TypedDict, total=False):
type: Literal["image"]
source: AnthropicImageParamSource
cache_control: Optional[dict]
class AnthropicMessagesToolResultContent(TypedDict):
@ -92,6 +94,12 @@ class AnthropicMetadata(TypedDict, total=False):
user_id: str
class AnthropicSystemMessageContent(TypedDict, total=False):
type: str
text: str
cache_control: Optional[dict]
class AnthropicMessagesRequest(TypedDict, total=False):
model: Required[str]
messages: Required[
@ -106,7 +114,7 @@ class AnthropicMessagesRequest(TypedDict, total=False):
metadata: AnthropicMetadata
stop_sequences: List[str]
stream: bool
system: str
system: Union[str, List]
temperature: float
tool_choice: AnthropicMessagesToolChoice
tools: List[AnthropicMessagesTool]

View file

@ -361,7 +361,7 @@ class ChatCompletionToolMessage(TypedDict):
class ChatCompletionSystemMessage(TypedDict, total=False):
role: Required[Literal["system"]]
content: Required[str]
content: Required[Union[str, List]]
name: str

View file

@ -2074,7 +2074,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"max_tokens": 4096,
@ -2085,7 +2086,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
@ -2096,7 +2098,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-opus@20240229": {
"max_tokens": 4096,
@ -2107,7 +2110,8 @@
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
@ -4531,6 +4535,69 @@
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-70b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-8b-instruct": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-huge-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000005,
"output_cost_per_token": 0.000005,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-large-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000001,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-chat": {
"max_tokens": 131072,
"max_input_tokens": 131072,
"max_output_tokens": 131072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/llama-3.1-sonar-small-128k-online": {
"max_tokens": 127072,
"max_input_tokens": 127072,
"max_output_tokens": 127072,
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "perplexity",
"mode": "chat"
},
"perplexity/pplx-7b-chat": {
"max_tokens": 8192,
"max_input_tokens": 8192,

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.43.10"
version = "1.43.13"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.43.10"
version = "1.43.13"
version_files = [
"pyproject.toml:^version"
]