forked from phoenix/litellm-mirror

Merge branch 'main' into litellm_parallel_requests

commit e6963217ba
79 changed files with 3913 additions and 180 deletions

@@ -8,7 +8,7 @@
     <img src="https://railway.app/button.svg" alt="Deploy on Railway">
     </a>
 </p>
-<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
+<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
 <br>
 </p>
 <h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>

@@ -9,13 +9,11 @@ services:
     #########################################
     ## Uncomment these lines to start proxy with a config.yaml file ##
     # volumes:
-    #  - ./proxy_server_config.yaml:/app/config.yaml
-    # command: [ "--config", "./config.yaml", "--port", "4000"]
     ###############################################
     ports:
       - "4000:4000" # Map the container port to the host, change the host port if necessary
     environment:
-      DATABASE_URL: "postgresql://postgres:example@db:5432/postgres"
+      DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
       STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
     env_file:
       - .env # Load local .env file

@@ -25,11 +23,31 @@ services:
     image: postgres
     restart: always
     environment:
-      POSTGRES_PASSWORD: example
+      POSTGRES_DB: litellm
+      POSTGRES_USER: llmproxy
+      POSTGRES_PASSWORD: dbpassword9090
     healthcheck:
-      test: ["CMD-SHELL", "pg_isready"]
+      test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
       interval: 1s
       timeout: 5s
       retries: 10
 
+  prometheus:
+    image: prom/prometheus
+    volumes:
+      - prometheus_data:/prometheus
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml
+    ports:
+      - "9090:9090"
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=15d'
+    restart: always
+
-# ...rest of your docker-compose config if any
+volumes:
+  prometheus_data:
+    driver: local
+
+
+# ...rest of your docker-compose config if any

docs/my-website/docs/observability/arize_integration.md (new file, 72 lines)

import Image from '@theme/IdealImage';

# 🔥 Arize AI - Logging LLM Input/Output

AI Observability and Evaluation Platform

:::tip

This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm

:::

## Pre-Requisites
Make an account on [Arize AI](https://app.arize.com/auth/login)

## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize

```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os

os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion

# LLM API Keys
os.environ['OPENAI_API_KEY']=""

# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]

# openai call
response = litellm.completion(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": "Hi 👋 - i'm openai"}
  ]
)
```

### Using with LiteLLM Proxy

```yaml
model_list:
  - model_name: gpt-4
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/

litellm_settings:
  callbacks: ["arize"]

environment_variables:
    ARIZE_SPACE_KEY: "d0*****"
    ARIZE_API_KEY: "141a****"
```

## Support & Talk to Founders

- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
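
The proxy section above stops at the config; a minimal sketch of a test request against it, assuming the proxy from that config is running locally on port 4000 with a virtual key `sk-1234` (both are assumptions, not part of the file above):

```python
import openai

# point the standard OpenAI client at the local LiteLLM proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# a successful call through the proxy should then appear as a trace in Arize
response = client.chat.completions.create(
    model="gpt-4",  # the model_name alias from the config above
    messages=[{"role": "user", "content": "Hi 👋 - test request for the arize callback"}],
)
print(response)
```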
docs/my-website/docs/observability/braintrust.md (new file, 147 lines)

import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# ⚡️ Braintrust - Evals + Logging

[Braintrust](https://www.braintrust.dev/) manages everything from evaluations, logging, and a prompt playground to data management for AI products.

## Quick Start

```python
# pip install litellm
import litellm
import os

# set env
os.environ["BRAINTRUST_API_KEY"] = ""
os.environ['OPENAI_API_KEY']=""

# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]

# openai call
response = litellm.completion(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": "Hi 👋 - i'm openai"}
  ]
)
```

## OpenAI Proxy Usage

1. Add keys to env
```env
BRAINTRUST_API_KEY=""
```

2. Add braintrust to callbacks
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
      api_key: os.environ/OPENAI_API_KEY

litellm_settings:
  callbacks: ["braintrust"]
```

3. Test it!

```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "groq-llama3",
    "messages": [
        { "role": "system", "content": "Use your tools smartly"},
        { "role": "user", "content": "What time is it now? Use your tool"}
    ]
}'
```

## Advanced - pass Project ID

<Tabs>
<TabItem value="sdk" label="SDK">

```python
response = litellm.completion(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": "Hi 👋 - i'm openai"}
  ],
  metadata={
    "project_id": "my-special-project"
  }
)
```

</TabItem>
<TabItem value="proxy" label="PROXY">

**Curl**

```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "groq-llama3",
    "messages": [
        { "role": "system", "content": "Use your tools smartly"},
        { "role": "user", "content": "What time is it now? Use your tool"}
    ],
    "metadata": {
        "project_id": "my-special-project"
    }
}'
```

**OpenAI SDK**

```python
import openai
client = openai.OpenAI(
    api_key="anything",
    base_url="http://0.0.0.0:4000"
)

# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages = [
        {
            "role": "user",
            "content": "this is a test request, write a short poem"
        }
    ],
    extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
        "metadata": { # 👈 use for logging additional params (e.g. to langfuse)
            "project_id": "my-special-project"
        }
    }
)

print(response)
```

For more examples, [**Click Here**](../proxy/user_keys.md#chatcompletions)

</TabItem>
</Tabs>

## Full API Spec

Here's everything you can pass in metadata for a braintrust request:

`braintrust_*` - any metadata field starting with `braintrust_` will be passed as metadata to the logging request

`project_id` - set the project id for a braintrust call. Default is `litellm`.
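
A sketch combining the two metadata fields above, on the assumption (stated by the spec above) that `braintrust_`-prefixed keys are forwarded as-is; the key name `braintrust_experiment_note` is illustrative, not a documented field:

```python
import litellm

litellm.callbacks = ["braintrust"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋"}],
    metadata={
        "project_id": "my-special-project",        # braintrust project to log under
        "braintrust_experiment_note": "baseline",  # hypothetical braintrust_* field, forwarded as metadata
    },
)
```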
@@ -1,4 +1,4 @@
-# 🧠 Helicone - OSS LLM Observability Platform
+# 🧊 Helicone - OSS LLM Observability Platform
 
 :::tip
 

@@ -1,6 +1,6 @@
 import Image from '@theme/IdealImage';
 
-# Langsmith - Logging LLM Input/Output
+# 🦜 Langsmith - Logging LLM Input/Output
 
 
 :::tip

@@ -56,7 +56,7 @@ response = litellm.completion(
 ```
 
 ## Advanced
-### Set Custom Project & Run names
+### Set Langsmith fields - Custom Project, Run names, tags
 
 ```python
 import litellm

@@ -77,6 +77,7 @@ response = litellm.completion(
     metadata={
         "run_name": "litellmRUN", # langsmith run name
         "project_name": "litellm-completion", # langsmith project name
+        "tags": ["model1", "prod-2"] # tags to log on langsmith
     }
 )
 print(response)

@@ -1,10 +1,16 @@
 import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
 
 # Raw Request/Response Logging
 
+
+## Logging
 See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.).
 
-**on SDK**
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
 ```python
 # pip install langfuse
 import litellm

@@ -34,13 +40,85 @@ response = litellm.completion(
 )
 ```
 
-**on Proxy**
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+
 ```yaml
 litellm_settings:
   log_raw_request_response: True
 ```
 
+
+</TabItem>
+</Tabs>
+
 **Expected Log**
 
 <Image img={require('../../img/raw_request_log.png')}/>
+
+
+## Return Raw Response Headers
+
+Return raw response headers from llm provider.
+
+Currently only supported for openai.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+import litellm
+import os
+
+litellm.return_response_headers = True
+
+## set ENV variables
+os.environ["OPENAI_API_KEY"] = "your-api-key"
+
+response = litellm.completion(
+    model="gpt-3.5-turbo",
+    messages=[{ "content": "Hello, how are you?","role": "user"}]
+)
+
+print(response._hidden_params)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: os.environ/GROQ_API_KEY
+
+litellm_settings:
+  return_response_headers: true
+```
+
+2. Test it!
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        { "role": "system", "content": "Use your tools smartly"},
+        { "role": "user", "content": "What time is it now? Use your tool"}
+    ]
+}'
+```
+</TabItem>
+</Tabs>
+
+**Expected Response**
+
+<Image img={require('../../img/raw_response_headers.png')}/>

docs/my-website/docs/oidc.md (new file, 223 lines)

# OpenID Connect (OIDC)
LiteLLM supports using OpenID Connect (OIDC) for authentication to upstream services. This allows you to avoid storing sensitive credentials in your configuration files.

## OIDC Identity Provider (IdP)

LiteLLM supports the following OIDC identity providers:

| Provider | Config Name | Custom Audiences |
| -------------------------| ------------ | ---------------- |
| Google Cloud Run | `google` | Yes |
| CircleCI v1 | `circleci` | No |
| CircleCI v2 | `circleci_v2`| No |
| GitHub Actions | `github` | Yes |
| Azure Kubernetes Service | `azure` | No |

If you would like to use a different OIDC provider, please open an issue on GitHub.

## OIDC Connect Relying Party (RP)

LiteLLM supports the following OIDC relying parties / clients:

- Amazon Bedrock
- Azure OpenAI
- _(Coming soon) Google Cloud Vertex AI_

### Configuring OIDC

Wherever a secret key can be used, OIDC can be used in-place. The general format is:

```
oidc/config_name_here/audience_here
```

For providers that do not use the `audience` parameter, you can (and should) omit it:

```
oidc/config_name_here/
```

## Examples

### Google Cloud Run -> Amazon Bedrock

```yaml
model_list:
  - model_name: claude-3-haiku-20240307
    litellm_params:
      model: bedrock/anthropic.claude-3-haiku-20240307-v1:0
      aws_region_name: us-west-2
      aws_session_name: "litellm"
      aws_role_name: "arn:aws:iam::YOUR_THING_HERE:role/litellm-google-demo"
      aws_web_identity_token: "oidc/google/https://example.com"
```

### CircleCI v2 -> Amazon Bedrock

```yaml
model_list:
  - model_name: command-r
    litellm_params:
      model: bedrock/cohere.command-r-v1:0
      aws_region_name: us-west-2
      aws_session_name: "my-test-session"
      aws_role_name: "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
      aws_web_identity_token: "oidc/circleci_v2/"
```

#### Amazon IAM Role Configuration for CircleCI v2 -> Bedrock

The configuration below is only an example. You should adjust the permissions and trust relationship to match your specific use case.

Permissions:

```json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "bedrock:InvokeModel",
                "bedrock:InvokeModelWithResponseStream"
            ],
            "Resource": [
                "arn:aws:bedrock:*::foundation-model/anthropic.claude-3-haiku-20240307-v1:0",
                "arn:aws:bedrock:*::foundation-model/cohere.command-r-v1:0"
            ]
        }
    ]
}
```

See https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html for more examples.

Trust Relationship:

```json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Federated": "arn:aws:iam::335785316107:oidc-provider/oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd"
            },
            "Action": "sts:AssumeRoleWithWebIdentity",
            "Condition": {
                "StringEquals": {
                    "oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:aud": "c5a99188-154f-4f69-8da2-b442b1bf78dd"
                },
                "ForAnyValue:StringLike": {
                    "oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:sub": [
                        "org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/main",
                        "org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/litellm_*"
                    ]
                }
            }
        }
    ]
}
```

This trust relationship restricts CircleCI to only assume the role on the main branch and branches that start with `litellm_`.

For CircleCI (v1 and v2), you also need to add your organization's OIDC provider in your AWS IAM settings. See https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-idp_oidc.html for more information.

:::tip

You should _never_ need to create an IAM user. If you did, you're not using OIDC correctly. You should only be creating a role with permissions and a trust relationship to your OIDC provider.

:::

### Google Cloud Run -> Azure OpenAI

```yaml
model_list:
  - model_name: gpt-4o-2024-05-13
    litellm_params:
      model: azure/gpt-4o-2024-05-13
      azure_ad_token: "oidc/google/https://example.com"
      api_version: "2024-06-01"
      api_base: "https://demo-here.openai.azure.com"
    model_info:
      base_model: azure/gpt-4o-2024-05-13
```

For Azure OpenAI, you need to define `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, and optionally `AZURE_AUTHORITY_HOST` in your environment.

```bash
export AZURE_CLIENT_ID="91a43c21-cf21-4f34-9085-331015ea4f91" # Azure AD Application (Client) ID
export AZURE_TENANT_ID="f3b1cf79-eba8-40c3-8120-cb26aca169c2" # Will be the same across all of your Azure AD applications
export AZURE_AUTHORITY_HOST="https://login.microsoftonline.com" # 👈 Optional, defaults to "https://login.microsoftonline.com"
```

:::tip

You can find `AZURE_CLIENT_ID` by visiting `https://login.microsoftonline.com/YOUR_DOMAIN_HERE/v2.0/.well-known/openid-configuration` and looking for the UUID in the `issuer` field.

:::

:::tip

Don't set `AZURE_AUTHORITY_HOST` in your environment unless you need to override the default value. This way, if the default value changes in the future, you won't need to update your environment.

:::

:::tip

By default, Azure AD applications use the audience `api://AzureADTokenExchange`. We recommend setting the audience to something more specific to your application.

:::

#### Azure AD Application Configuration

Unfortunately, Azure is a bit more complicated to set up than other OIDC relying parties like AWS. Basically, you have to:

1. Create an Azure application.
2. Add a federated credential for the OIDC IdP you're using (e.g. Google Cloud Run).
3. Add the Azure application to the resource group that contains the Azure OpenAI resource(s).
4. Give the Azure application the necessary role to access the Azure OpenAI resource(s).

The custom role below is the recommended minimum permissions for the Azure application to access Azure OpenAI resources. You should adjust the permissions to match your specific use case.

```json
{
    "id": "/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/providers/Microsoft.Authorization/roleDefinitions/baf42808-99ff-466d-b9da-f95bb0422c5f",
    "properties": {
        "roleName": "invoke-only",
        "description": "",
        "assignableScopes": [
            "/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/resourceGroups/your-openai-group-name"
        ],
        "permissions": [
            {
                "actions": [],
                "notActions": [],
                "dataActions": [
                    "Microsoft.CognitiveServices/accounts/OpenAI/deployments/audio/action",
                    "Microsoft.CognitiveServices/accounts/OpenAI/deployments/search/action",
                    "Microsoft.CognitiveServices/accounts/OpenAI/deployments/completions/action",
                    "Microsoft.CognitiveServices/accounts/OpenAI/deployments/chat/completions/action",
                    "Microsoft.CognitiveServices/accounts/OpenAI/deployments/extensions/chat/completions/action",
                    "Microsoft.CognitiveServices/accounts/OpenAI/deployments/embeddings/action",
                    "Microsoft.CognitiveServices/accounts/OpenAI/images/generations/action"
                ],
                "notDataActions": []
            }
        ]
    }
}
```

_Note: Your UUIDs will be different._

Please contact us for paid enterprise support if you need help setting up Azure AD applications.
@@ -56,7 +56,7 @@ for chunk in response:
 print(chunk["choices"][0]["delta"]["content"])  # same as openai format
 ```
 
-## OpenAI Proxy Usage
+## Usage with LiteLLM Proxy
 
 Here's how to call Anthropic with the LiteLLM Proxy Server
 

@@ -69,14 +69,6 @@ export ANTHROPIC_API_KEY="your-api-key"
 ### 2. Start the proxy
 
 <Tabs>
-<TabItem value="cli" label="cli">
-
-```bash
-$ litellm --model claude-3-opus-20240229
-
-# Server running on http://0.0.0.0:4000
-```
-</TabItem>
 <TabItem value="config" label="config.yaml">
 
 ```yaml

@@ -91,6 +83,14 @@ model_list:
 litellm --config /path/to/config.yaml
 ```
 </TabItem>
+<TabItem value="cli" label="cli">
+
+```bash
+$ litellm --model claude-3-opus-20240229
+
+# Server running on http://0.0.0.0:4000
+```
+</TabItem>
 </Tabs>
 
 ### 3. Test it

docs/my-website/docs/providers/friendliai.md (new file, 60 lines)

# FriendliAI
https://suite.friendli.ai/

**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests**

## API Key
```python
# env variable
os.environ['FRIENDLI_TOKEN']
os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint.
```

## Sample Usage
```python
from litellm import completion
import os

os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
    model="friendliai/mixtral-8x7b-instruct-v0-1",
    messages=[
       {"role": "user", "content": "hello from litellm"}
   ],
)
print(response)
```

## Sample Usage - Streaming
```python
from litellm import completion
import os

os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
    model="friendliai/mixtral-8x7b-instruct-v0-1",
    messages=[
       {"role": "user", "content": "hello from litellm"}
   ],
    stream=True
)

for chunk in response:
    print(chunk)
```

## Supported Models
### Serverless Endpoints
We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests

| Model Name | Function Call |
|--------------------------|-----------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` |
| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` |
| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` |

### Dedicated Endpoints
```
model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE"
```
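
A usage sketch for the dedicated-endpoint format above; the endpoint ID, adapter route, and base URL below are placeholders rather than real values:

```python
from litellm import completion
import os

os.environ["FRIENDLI_TOKEN"] = ""     # token for your dedicated endpoint
os.environ["FRIENDLI_API_BASE"] = ""  # base URL of your dedicated endpoint (placeholder)

response = completion(
    # "$ENDPOINT_ID:$ADAPTER_ROUTE" filled in with placeholder values
    model="friendliai/YOUR_ENDPOINT_ID:YOUR_ADAPTER_ROUTE",
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response)
```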
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Groq
 https://groq.com/
 

@@ -20,7 +23,7 @@ import os
 
 os.environ['GROQ_API_KEY'] = ""
 response = completion(
-    model="groq/llama2-70b-4096",
+    model="groq/llama3-8b-8192",
     messages=[
        {"role": "user", "content": "hello from litellm"}
    ],

@@ -35,7 +38,7 @@ import os
 
 os.environ['GROQ_API_KEY'] = ""
 response = completion(
-    model="groq/llama2-70b-4096",
+    model="groq/llama3-8b-8192",
     messages=[
        {"role": "user", "content": "hello from litellm"}
    ],

@@ -47,6 +50,101 @@ for chunk in response:
 ```
 
 
+
+## Usage with LiteLLM Proxy
+
+### 1. Set Groq Models on config.yaml
+
+```yaml
+model_list:
+  - model_name: groq-llama3-8b-8192 # Model Alias to use for requests
+    litellm_params:
+      model: groq/llama3-8b-8192
+      api_key: "os.environ/GROQ_API_KEY" # ensure you have `GROQ_API_KEY` in your .env
+```
+
+### 2. Start Proxy
+
+```
+litellm --config config.yaml
+```
+
+### 3. Test it
+
+Make request to litellm proxy
+
+<Tabs>
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+    "model": "groq-llama3-8b-8192",
+    "messages": [
+        {
+        "role": "user",
+        "content": "what llm are you"
+        }
+    ]
+}
+'
+```
+</TabItem>
+<TabItem value="openai" label="OpenAI v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+response = client.chat.completions.create(model="groq-llama3-8b-8192", messages = [
+    {
+        "role": "user",
+        "content": "this is a test request, write a short poem"
+    }
+])
+
+print(response)
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
+    model = "groq-llama3-8b-8192",
+    temperature=0.1
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+
+
 ## Supported Models - ALL Groq Models Supported!
 We support ALL Groq models, just set `groq/` as a prefix when sending completion requests
 

@@ -114,7 +212,7 @@ tools = [
     }
 ]
 response = litellm.completion(
-    model="groq/llama2-70b-4096",
+    model="groq/llama3-8b-8192",
     messages=messages,
     tools=tools,
     tool_choice="auto",  # auto is default, but we'll be explicit

@@ -154,7 +252,7 @@ if tool_calls:
         )  # extend conversation with function response
     print(f"messages: {messages}")
     second_response = litellm.completion(
-        model="groq/llama2-70b-4096", messages=messages
+        model="groq/llama3-8b-8192", messages=messages
     )  # get a new response from the model where it can see the function response
     print("second response\n", second_response)
 ```

@@ -749,6 +749,85 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 </TabItem>
 </Tabs>
 
+
+## Llama 3 API
+
+| Model Name | Function Call |
+|------------------|--------------------------------------|
+| meta/llama3-405b-instruct-maas | `completion('vertex_ai/meta/llama3-405b-instruct-maas', messages)` |
+
+### Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+import os
+
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
+
+model = "meta/llama3-405b-instruct-maas"
+
+vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
+vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
+
+response = completion(
+    model="vertex_ai/" + model,
+    messages=[{"role": "user", "content": "hi"}],
+    temperature=0.7,
+    vertex_ai_project=vertex_ai_project,
+    vertex_ai_location=vertex_ai_location,
+)
+print("\nModel Response", response)
+```
+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+**1. Add to config**
+
+```yaml
+model_list:
+  - model_name: anthropic-llama
+    litellm_params:
+      model: vertex_ai/meta/llama3-405b-instruct-maas
+      vertex_ai_project: "my-test-project"
+      vertex_ai_location: "us-east-1"
+  - model_name: anthropic-llama
+    litellm_params:
+      model: vertex_ai/meta/llama3-405b-instruct-maas
+      vertex_ai_project: "my-test-project"
+      vertex_ai_location: "us-west-1"
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING at http://0.0.0.0:4000
+```
+
+**3. Test it!**
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{
+    "model": "anthropic-llama", # 👈 the 'model_name' in config
+    "messages": [
+        {
+        "role": "user",
+        "content": "what llm are you"
+        }
+    ],
+}'
+```
+
+</TabItem>
+</Tabs>
+
 ## Model Garden
 | Model Name | Function Call |
 |------------------|--------------------------------------|

@@ -119,8 +119,8 @@ All Possible Alert Types
 
 ```python
 AlertType = Literal[
-    "llm_exceptions",
-    "llm_too_slow",
+    "llm_exceptions", # LLM API Exceptions
+    "llm_too_slow", # LLM Responses slower than alerting_threshold
     "llm_requests_hanging",
     "budget_alerts",
     "db_exceptions",

@@ -133,6 +133,61 @@ AlertType = Literal[
 
 ```
 
+## Advanced - set specific slack channels per alert type
+
+Use this if you want to set specific channels per alert type
+
+**This allows you to do the following**
+```
+llm_exceptions -> go to slack channel #llm-exceptions
+spend_reports -> go to slack channel #llm-spend-reports
+```
+
+Set `alert_to_webhook_url` on your config.yaml
+
+```yaml
+model_list:
+  - model_name: gpt-4
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+general_settings:
+  master_key: sk-1234
+  alerting: ["slack"]
+  alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
+  alert_to_webhook_url: {
+    "llm_exceptions": "https://hooks.slack.com/services/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",
+    "llm_too_slow": "https://hooks.slack.com/services/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",
+    "llm_requests_hanging": "https://hooks.slack.com/services/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",
+    "budget_alerts": "https://hooks.slack.com/services/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",
+    "db_exceptions": "https://hooks.slack.com/services/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",
+    "daily_reports": "https://hooks.slack.com/services/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",
+    "spend_reports": "https://hooks.slack.com/services/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",
+    "cooldown_deployment": "https://hooks.slack.com/services/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",
+    "new_model_added": "https://hooks.slack.com/services/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",
+    "outage_alerts": "https://hooks.slack.com/services/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",
+  }
+
+litellm_settings:
+  success_callback: ["langfuse"]
+```
+
+Test it - send a valid llm request - expect to see a `llm_too_slow` alert in its own slack channel
+
+```shell
+curl -i http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [
+      {"role": "user", "content": "Hello, Claude gm!"}
+    ]
+}'
+```
+
+
 ## Advanced - Using MS Teams Webhooks
 

@@ -266,6 +266,54 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 }'
 ```
 
+## Disable team from turning on/off guardrails
+
+
+### 1. Disable team from modifying guardrails
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/team/update' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{
+    "team_id": "4198d93c-d375-4c83-8d5a-71e7c5473e50",
+    "metadata": {"guardrails": {"modify_guardrails": false}}
+}'
+```
+
+### 2. Try to disable guardrails for a call
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
+--data '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+            "role": "user",
+            "content": "Think of 10 random colors."
+        }
+    ],
+    "metadata": {"guardrails": {"hide_secrets": false}}
+}'
+```
+
+### 3. Get 403 Error
+
+```
+{
+    "error": {
+        "message": {
+            "error": "Your team does not have permission to modify guardrails."
+        },
+        "type": "auth_error",
+        "param": "None",
+        "code": 403
+    }
+}
+```
+
 Expect to NOT see `+1 412-612-9992` in your server logs on your callback.
 
 :::info
 

@@ -48,6 +48,20 @@ A number of these headers could be useful for troubleshooting, but the
 `x-litellm-call-id` is the one that is most useful for tracking a request across
 components in your system, including in logging tools.
 
+## Redacting UserAPIKeyInfo
+
+Redact information about the user api key (hashed token, user_id, team id, etc.) from logs.
+
+Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
+
+```yaml
+litellm_settings:
+  callbacks: ["langfuse"]
+  redact_user_api_key_info: true
+```
+
+Removes any field with `user_api_key_*` from metadata.
+
 ## Logging Proxy Input/Output - Langfuse
 
 We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment

@@ -202,6 +216,9 @@ print(response)
 
 ### Team based Logging to Langfuse
 
+[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging)
+<!--
+
 **Example:**
 
 This config would send langfuse logs to 2 different langfuse projects, based on the team id

@@ -228,7 +245,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
     -d '{"team_id": "ishaans-secret-project"}'
 ```
 
-All requests made with these keys will log data to their team-specific logging.
+All requests made with these keys will log data to their team-specific logging. -->
 
 ### Redacting Messages, Response Content from Langfuse Logging
 

@@ -1106,6 +1123,52 @@ environment_variables:
 ```
 
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+    "model": "fake-openai-endpoint",
+    "messages": [
+        {
+            "role": "user",
+            "content": "Hello, Claude gm!"
+        }
+    ],
+}
+'
+```
+Expect to see your log on Langfuse
+<Image img={require('../../img/langsmith_new.png')} />
+
+
+## Logging LLM IO to Arize AI
+
+1. Set `success_callback: ["arize"]` on litellm config.yaml
+
+```yaml
+model_list:
+  - model_name: gpt-4
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+litellm_settings:
+  callbacks: ["arize"]
+
+environment_variables:
+    ARIZE_SPACE_KEY: "d0*****"
+    ARIZE_API_KEY: "141a****"
+```
+
 2. Start Proxy
 
 ```

@@ -70,3 +70,42 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
     "user": "usha"
 }'
 ```
+
+## Team Based Logging
+
+[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md)
+
+
+<!--
+## Logging / Caching
+
+Turn on/off logging and caching for a specific team id.
+
+**Example:**
+
+This config would send langfuse logs to 2 different langfuse projects, based on the team id
+
+```yaml
+litellm_settings:
+  default_team_settings:
+    - team_id: my-secret-project
+      success_callback: ["langfuse"]
+      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
+      langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
+    - team_id: ishaans-secret-project
+      success_callback: ["langfuse"]
+      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
+      langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
+```
+
+Now, when you [generate keys](./virtual_keys.md) for this team-id
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/key/generate' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{"team_id": "ishaans-secret-project"}'
+```
+
+All requests made with these keys will log data to their team-specific logging. -->

docs/my-website/docs/proxy/team_logging.md (new file, 144 lines)

import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# 👥📊 Team Based Logging

Allow each team to use their own Langfuse Project / custom callbacks

**This allows you to do the following**
```
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```

## Set Callbacks Per Team

### 1. Set callback for team

We make a request to `POST /team/{team_id}/callback` to add a callback for the team:

```shell
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
        -H 'Content-Type: application/json' \
        -H 'Authorization: Bearer sk-1234' \
        -d '{
        "callback_name": "langfuse",
        "callback_type": "success",
        "callback_vars": {
            "langfuse_public_key": "pk",
            "langfuse_secret_key": "sk_",
            "langfuse_host": "https://cloud.langfuse.com"
        }
}'
```

#### Supported Values

| Field | Supported Values | Notes |
|-------|------------------|-------|
| `callback_name` | `"langfuse"` | Currently only supports "langfuse" |
| `callback_type` | `"success"`, `"failure"`, `"success_and_failure"` | |
| `callback_vars` | | dict of callback settings |
| `langfuse_public_key` | string | Required |
| `langfuse_secret_key` | string | Required |
| `langfuse_host` | string | Optional (defaults to https://cloud.langfuse.com) |

### 2. Create key for team

All keys created for team `dbe2f686-a686-4896-864a-4c3924458709` will log to langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)

```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
    --header 'Authorization: Bearer sk-1234' \
    --header 'Content-Type: application/json' \
    --data '{
        "team_id": "dbe2f686-a686-4896-864a-4c3924458709"
}'
```

### 3. Make `/chat/completion` request for team

```shell
curl -i http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
  -d '{
    "model": "gpt-4",
    "messages": [
      {"role": "user", "content": "Hello, Claude gm!"}
    ]
}'
```

Expect this to be logged on the langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)

## Disable Logging for a Team

To disable logging for a specific team, you can use the following endpoint:

`POST /team/{team_id}/disable_logging`

This endpoint removes all success and failure callbacks for the specified team, effectively disabling logging.

### Step 1. Disable logging for team

```shell
curl -X POST 'http://localhost:4000/team/YOUR_TEAM_ID/disable_logging' \
        -H 'Authorization: Bearer YOUR_API_KEY'
```
Replace YOUR_TEAM_ID with the actual team ID.

**Response**
A successful request will return a response similar to this:
```json
{
    "status": "success",
    "message": "Logging disabled for team YOUR_TEAM_ID",
    "data": {
        "team_id": "YOUR_TEAM_ID",
        "success_callbacks": [],
        "failure_callbacks": []
    }
}
```

### Step 2. Test it - `/chat/completions`

Use a key generated for team = `team_id` - you should see no logs on your configured success callback (eg. Langfuse)

```shell
curl -i http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
  -d '{
    "model": "gpt-4",
    "messages": [
      {"role": "user", "content": "Hello, Claude gm!"}
    ]
}'
```

### Debugging / Troubleshooting

- Check active callbacks for team using `GET /team/{team_id}/callback`

Use this to check what success/failure callbacks are active for team=`team_id`

```shell
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
        -H 'Authorization: Bearer sk-1234'
```

## Team Logging Endpoints

- [`POST /team/{team_id}/callback` Add a success/failure callback to a team](https://litellm-api.up.railway.app/#/team%20management/add_team_callbacks_team__team_id__callback_post)
- [`GET /team/{team_id}/callback` - Get the success/failure callbacks and variables for a team](https://litellm-api.up.railway.app/#/team%20management/get_team_callbacks_team__team_id__callback_get)
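
An OpenAI-SDK equivalent of the `/chat/completions` test above, assuming the proxy runs locally on port 4000 and the key is one generated for the team in step 2:

```python
import openai

# team-scoped virtual key from step 2; requests made with it inherit the team's callbacks
client = openai.OpenAI(
    api_key="sk-KbUuE0WNptC0jXapyMmLBA",
    base_url="http://localhost:4000",
)

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
)
print(response)
```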
docs/my-website/img/raw_response_headers.png (new binary file, 117 KiB, not shown)

@@ -44,19 +44,20 @@ const sidebars = {
         "proxy/cost_tracking",
         "proxy/self_serve",
         "proxy/virtual_keys",
-        "proxy/tag_routing",
-        "proxy/users",
-        "proxy/team_budgets",
-        "proxy/customers",
-        "proxy/billing",
-        "proxy/guardrails",
-        "proxy/token_auth",
-        "proxy/alerting",
         {
           type: "category",
           label: "🪢 Logging",
           items: ["proxy/logging", "proxy/streaming_logging"],
         },
+        "proxy/team_logging",
+        "proxy/guardrails",
+        "proxy/tag_routing",
+        "proxy/users",
+        "proxy/team_budgets",
+        "proxy/customers",
+        "proxy/billing",
+        "proxy/token_auth",
+        "proxy/alerting",
         "proxy/ui",
         "proxy/prometheus",
         "proxy/pass_through",

@@ -157,6 +158,7 @@ const sidebars = {
         "providers/triton-inference-server",
         "providers/ollama",
         "providers/perplexity",
+        "providers/friendliai",
         "providers/groq",
         "providers/deepseek",
         "providers/fireworks_ai",

@@ -183,7 +185,14 @@ const sidebars = {
     "scheduler",
     "set_keys",
     "budget_manager",
-    "secret",
+    {
+      type: "category",
+      label: "Secret Manager",
+      items: [
+        "secret",
+        "oidc"
+      ]
+    },
     "completion/token_usage",
     "load_test",
     {

@@ -192,17 +201,19 @@ const sidebars = {
       items: [
         "observability/langfuse_integration",
         "observability/logfire_integration",
+        "observability/langsmith_integration",
+        "observability/arize_integration",
         "debugging/local_debugging",
         "observability/raw_request_response",
         "observability/custom_callback",
         "observability/scrub_data",
-        "observability/helicone_integration",
+        "observability/braintrust",
         "observability/sentry",
         "observability/lago",
+        "observability/helicone_integration",
         "observability/openmeter",
         "observability/promptlayer_integration",
         "observability/wandb_integration",
-        "observability/langsmith_integration",
         "observability/slack_integration",
         "observability/traceloop_integration",
         "observability/athina_integration",

@@ -4,7 +4,7 @@ import warnings
 warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
 ### INIT VARIABLES ###
 import threading, requests, os
-from typing import Callable, List, Optional, Dict, Union, Any, Literal
+from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.caching import Cache
 from litellm._logging import (
@@ -38,8 +38,18 @@ success_callback: List[Union[str, Callable]] = []
 failure_callback: List[Union[str, Callable]] = []
 service_callback: List[Union[str, Callable]] = []
 _custom_logger_compatible_callbacks_literal = Literal[
-    "lago", "openmeter", "logfire", "dynamic_rate_limiter", "langsmith", "galileo"
+    "lago",
+    "openmeter",
+    "logfire",
+    "dynamic_rate_limiter",
+    "langsmith",
+    "galileo",
+    "braintrust",
+    "arize",
 ]
+_known_custom_logger_compatible_callbacks: List = list(
+    get_args(_custom_logger_compatible_callbacks_literal)
+)
 callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
 _langfuse_default_tags: Optional[
     List[
@@ -67,6 +77,7 @@ post_call_rules: List[Callable] = []
 turn_off_message_logging: Optional[bool] = False
 log_raw_request_response: bool = False
 redact_messages_in_exceptions: Optional[bool] = False
+redact_user_api_key_info: Optional[bool] = False
 store_audit_logs = False  # Enterprise feature, allow users to see audit logs
 ## end of callbacks #############
@@ -346,6 +357,7 @@ vertex_text_models: List = []
 vertex_code_text_models: List = []
 vertex_embedding_models: List = []
 vertex_anthropic_models: List = []
+vertex_llama3_models: List = []
 ai21_models: List = []
 nlp_cloud_models: List = []
 aleph_alpha_models: List = []
@@ -388,6 +400,9 @@ for key, value in model_cost.items():
     elif value.get("litellm_provider") == "vertex_ai-anthropic_models":
         key = key.replace("vertex_ai/", "")
         vertex_anthropic_models.append(key)
+    elif value.get("litellm_provider") == "vertex_ai-llama_models":
+        key = key.replace("vertex_ai/", "")
+        vertex_llama3_models.append(key)
     elif value.get("litellm_provider") == "ai21":
         ai21_models.append(key)
     elif value.get("litellm_provider") == "nlp_cloud":
@@ -817,6 +832,7 @@ from .llms.petals import PetalsConfig
 from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig
 from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
 from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
+from .llms.vertex_ai_llama import VertexAILlama3Config
 from .llms.sagemaker import SagemakerConfig
 from .llms.ollama import OllamaConfig
 from .llms.ollama_chat import OllamaChatConfig
@@ -872,6 +888,7 @@ from .exceptions import (
     APIError,
     Timeout,
     APIConnectionError,
+    UnsupportedParamsError,
     APIResponseValidationError,
     UnprocessableEntityError,
     InternalServerError,
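The new `_known_custom_logger_compatible_callbacks` list is derived from the `Literal` type via `typing.get_args`. A minimal, self-contained sketch of that pattern, using a shortened set of callback names purely for illustration:

```python
from typing import Literal, get_args

# illustration only: a subset of the callback names from the real Literal
_callbacks_literal = Literal["lago", "openmeter", "langsmith", "braintrust", "arize"]

# get_args() returns the Literal's allowed values as a tuple; list() makes it easy to iterate/validate
_known_callbacks = list(get_args(_callbacks_literal))
print(_known_callbacks)  # ['lago', 'openmeter', 'langsmith', 'braintrust', 'arize']
```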
@@ -682,11 +682,39 @@ class JSONSchemaValidationError(APIError):
         )


+class UnsupportedParamsError(BadRequestError):
+    def __init__(
+        self,
+        message,
+        llm_provider: Optional[str] = None,
+        model: Optional[str] = None,
+        status_code: int = 400,
+        response: Optional[httpx.Response] = None,
+        litellm_debug_info: Optional[str] = None,
+        max_retries: Optional[int] = None,
+        num_retries: Optional[int] = None,
+    ):
+        self.status_code = 400
+        self.message = "litellm.UnsupportedParamsError: {}".format(message)
+        self.model = model
+        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
+        response = response or httpx.Response(
+            status_code=self.status_code,
+            request=httpx.Request(
+                method="GET", url="https://litellm.ai"
+            ),  # mock request object
+        )
+        self.max_retries = max_retries
+        self.num_retries = num_retries
+
+
 LITELLM_EXCEPTION_TYPES = [
     AuthenticationError,
     NotFoundError,
     BadRequestError,
     UnprocessableEntityError,
+    UnsupportedParamsError,
     Timeout,
     PermissionDeniedError,
     RateLimitError,
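A small usage sketch for the new exception class; the message, model, and provider names below are made-up placeholders:

```python
from litellm.exceptions import UnsupportedParamsError

try:
    # hypothetical: a provider integration rejecting an unsupported request parameter
    raise UnsupportedParamsError(
        message="`response_format` is not supported by this provider",
        model="example-model",
        llm_provider="example-provider",
    )
except UnsupportedParamsError as e:
    # status_code is always 400; the message is prefixed with "litellm.UnsupportedParamsError:"
    print(e.status_code, e.message)
```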
litellm/integrations/_types/open_inference.py (new file, 286 lines)
@@ -0,0 +1,286 @@
from enum import Enum


class SpanAttributes:
    OUTPUT_VALUE = "output.value"
    OUTPUT_MIME_TYPE = "output.mime_type"
    """
    The type of output.value. If unspecified, the type is plain text by default.
    If type is JSON, the value is a string representing a JSON object.
    """
    INPUT_VALUE = "input.value"
    INPUT_MIME_TYPE = "input.mime_type"
    """
    The type of input.value. If unspecified, the type is plain text by default.
    If type is JSON, the value is a string representing a JSON object.
    """

    EMBEDDING_EMBEDDINGS = "embedding.embeddings"
    """
    A list of objects containing embedding data, including the vector and represented piece of text.
    """
    EMBEDDING_MODEL_NAME = "embedding.model_name"
    """
    The name of the embedding model.
    """

    LLM_FUNCTION_CALL = "llm.function_call"
    """
    For models and APIs that support function calling. Records attributes such as the function
    name and arguments to the called function.
    """
    LLM_INVOCATION_PARAMETERS = "llm.invocation_parameters"
    """
    Invocation parameters passed to the LLM or API, such as the model name, temperature, etc.
    """
    LLM_INPUT_MESSAGES = "llm.input_messages"
    """
    Messages provided to a chat API.
    """
    LLM_OUTPUT_MESSAGES = "llm.output_messages"
    """
    Messages received from a chat API.
    """
    LLM_MODEL_NAME = "llm.model_name"
    """
    The name of the model being used.
    """
    LLM_PROMPTS = "llm.prompts"
    """
    Prompts provided to a completions API.
    """
    LLM_PROMPT_TEMPLATE = "llm.prompt_template.template"
    """
    The prompt template as a Python f-string.
    """
    LLM_PROMPT_TEMPLATE_VARIABLES = "llm.prompt_template.variables"
    """
    A list of input variables to the prompt template.
    """
    LLM_PROMPT_TEMPLATE_VERSION = "llm.prompt_template.version"
    """
    The version of the prompt template being used.
    """
    LLM_TOKEN_COUNT_PROMPT = "llm.token_count.prompt"
    """
    Number of tokens in the prompt.
    """
    LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
    """
    Number of tokens in the completion.
    """
    LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
    """
    Total number of tokens, including both prompt and completion.
    """

    TOOL_NAME = "tool.name"
    """
    Name of the tool being used.
    """
    TOOL_DESCRIPTION = "tool.description"
    """
    Description of the tool's purpose, typically used to select the tool.
    """
    TOOL_PARAMETERS = "tool.parameters"
    """
    Parameters of the tool represented a dictionary JSON string, e.g.
    see https://platform.openai.com/docs/guides/gpt/function-calling
    """

    RETRIEVAL_DOCUMENTS = "retrieval.documents"

    METADATA = "metadata"
    """
    Metadata attributes are used to store user-defined key-value pairs.
    For example, LangChain uses metadata to store user-defined attributes for a chain.
    """

    TAG_TAGS = "tag.tags"
    """
    Custom categorical tags for the span.
    """

    OPENINFERENCE_SPAN_KIND = "openinference.span.kind"

    SESSION_ID = "session.id"
    """
    The id of the session
    """
    USER_ID = "user.id"
    """
    The id of the user
    """


class MessageAttributes:
    """
    Attributes for a message sent to or from an LLM
    """

    MESSAGE_ROLE = "message.role"
    """
    The role of the message, such as "user", "agent", "function".
    """
    MESSAGE_CONTENT = "message.content"
    """
    The content of the message to or from the llm, must be a string.
    """
    MESSAGE_CONTENTS = "message.contents"
    """
    The message contents to the llm, it is an array of
    `message_content` prefixed attributes.
    """
    MESSAGE_NAME = "message.name"
    """
    The name of the message, often used to identify the function
    that was used to generate the message.
    """
    MESSAGE_TOOL_CALLS = "message.tool_calls"
    """
    The tool calls generated by the model, such as function calls.
    """
    MESSAGE_FUNCTION_CALL_NAME = "message.function_call_name"
    """
    The function name that is a part of the message list.
    This is populated for role 'function' or 'agent' as a mechanism to identify
    the function that was called during the execution of a tool.
    """
    MESSAGE_FUNCTION_CALL_ARGUMENTS_JSON = "message.function_call_arguments_json"
    """
    The JSON string representing the arguments passed to the function
    during a function call.
    """


class MessageContentAttributes:
    """
    Attributes for the contents of user messages sent to an LLM.
    """

    MESSAGE_CONTENT_TYPE = "message_content.type"
    """
    The type of the content, such as "text" or "image".
    """
    MESSAGE_CONTENT_TEXT = "message_content.text"
    """
    The text content of the message, if the type is "text".
    """
    MESSAGE_CONTENT_IMAGE = "message_content.image"
    """
    The image content of the message, if the type is "image".
    An image can be made available to the model by passing a link to
    the image or by passing the base64 encoded image directly in the
    request.
    """


class ImageAttributes:
    """
    Attributes for images
    """

    IMAGE_URL = "image.url"
    """
    An http or base64 image url
    """


class DocumentAttributes:
    """
    Attributes for a document.
    """

    DOCUMENT_ID = "document.id"
    """
    The id of the document.
    """
    DOCUMENT_SCORE = "document.score"
    """
    The score of the document
    """
    DOCUMENT_CONTENT = "document.content"
    """
    The content of the document.
    """
    DOCUMENT_METADATA = "document.metadata"
    """
    The metadata of the document represented as a dictionary
    JSON string, e.g. `"{ 'title': 'foo' }"`
    """


class RerankerAttributes:
    """
    Attributes for a reranker
    """

    RERANKER_INPUT_DOCUMENTS = "reranker.input_documents"
    """
    List of documents as input to the reranker
    """
    RERANKER_OUTPUT_DOCUMENTS = "reranker.output_documents"
    """
    List of documents as output from the reranker
    """
    RERANKER_QUERY = "reranker.query"
    """
    Query string for the reranker
    """
    RERANKER_MODEL_NAME = "reranker.model_name"
    """
    Model name of the reranker
    """
    RERANKER_TOP_K = "reranker.top_k"
    """
    Top K parameter of the reranker
    """


class EmbeddingAttributes:
    """
    Attributes for an embedding
    """

    EMBEDDING_TEXT = "embedding.text"
    """
    The text represented by the embedding.
    """
    EMBEDDING_VECTOR = "embedding.vector"
    """
    The embedding vector.
    """


class ToolCallAttributes:
    """
    Attributes for a tool call
    """

    TOOL_CALL_FUNCTION_NAME = "tool_call.function.name"
    """
    The name of function that is being called during a tool call.
    """
    TOOL_CALL_FUNCTION_ARGUMENTS_JSON = "tool_call.function.arguments"
    """
    The JSON string representing the arguments passed to the function
    during a tool call.
    """


class OpenInferenceSpanKindValues(Enum):
    TOOL = "TOOL"
    CHAIN = "CHAIN"
    LLM = "LLM"
    RETRIEVER = "RETRIEVER"
    EMBEDDING = "EMBEDDING"
    AGENT = "AGENT"
    RERANKER = "RERANKER"
    UNKNOWN = "UNKNOWN"
    GUARDRAIL = "GUARDRAIL"
    EVALUATOR = "EVALUATOR"


class OpenInferenceMimeTypeValues(Enum):
    TEXT = "text/plain"
    JSON = "application/json"
litellm/integrations/arize_ai.py (new file, 114 lines)
@@ -0,0 +1,114 @@
"""
arize AI is OTEL compatible

this file has Arize ai specific helper functions
"""

from typing import TYPE_CHECKING, Any, Optional, Union

if TYPE_CHECKING:
    from opentelemetry.trace import Span as _Span

    Span = _Span
else:
    Span = Any


def set_arize_ai_attributes(span: Span, kwargs, response_obj):
    from litellm.integrations._types.open_inference import (
        MessageAttributes,
        MessageContentAttributes,
        OpenInferenceSpanKindValues,
        SpanAttributes,
    )

    optional_params = kwargs.get("optional_params", {})
    litellm_params = kwargs.get("litellm_params", {}) or {}

    #############################################
    ############ LLM CALL METADATA ##############
    #############################################
    # commented out for now - looks like Arize AI could not log this
    # metadata = litellm_params.get("metadata", {}) or {}
    # span.set_attribute(SpanAttributes.METADATA, str(metadata))

    #############################################
    ########## LLM Request Attributes ###########
    #############################################

    # The name of the LLM a request is being made to
    if kwargs.get("model"):
        span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))

    span.set_attribute(
        SpanAttributes.OPENINFERENCE_SPAN_KIND, OpenInferenceSpanKindValues.LLM.value
    )
    messages = kwargs.get("messages")

    # for /chat/completions
    # https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
    if messages:
        span.set_attribute(
            SpanAttributes.INPUT_VALUE,
            messages[-1].get("content", ""),  # get the last message for input
        )

        # LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
        for idx, msg in enumerate(messages):
            # Set the role per message
            span.set_attribute(
                f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
                msg["role"],
            )
            # Set the content per message
            span.set_attribute(
                f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
                msg.get("content", ""),
            )

    # The Generative AI Provider: Azure, OpenAI, etc.
    span.set_attribute(SpanAttributes.LLM_INVOCATION_PARAMETERS, str(optional_params))

    if optional_params.get("user"):
        span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))

    #############################################
    ########## LLM Response Attributes ##########
    # https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
    #############################################
    for choice in response_obj.get("choices"):
        response_message = choice.get("message", {})
        span.set_attribute(
            SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
        )

        # This shows up under `output_messages` tab on the span page
        # This code assumes a single response
        span.set_attribute(
            f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
            response_message["role"],
        )
        span.set_attribute(
            f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
            response_message.get("content", ""),
        )

    usage = response_obj.get("usage")
    if usage:
        span.set_attribute(
            SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
            usage.get("total_tokens"),
        )

        # The number of tokens used in the LLM response (completion).
        span.set_attribute(
            SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
            usage.get("completion_tokens"),
        )

        # The number of tokens used in the LLM prompt.
        span.set_attribute(
            SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
            usage.get("prompt_tokens"),
        )
    pass
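Going by the callback registration added in this commit (the "arize" entry in the callbacks Literal plus the ARIZE_SPACE_KEY / ARIZE_API_KEY checks), enabling Arize tracing from the SDK would look roughly like the sketch below; the key values and model name are placeholders:

```python
import os

import litellm

# placeholders: real Arize credentials are required for spans to actually export
os.environ["ARIZE_SPACE_KEY"] = "my-arize-space-key"
os.environ["ARIZE_API_KEY"] = "my-arize-api-key"

# routes spans through the shared OpenTelemetry logger with callback_name="arize"
litellm.callbacks = ["arize"]

response = litellm.completion(
    model="gpt-3.5-turbo",  # any supported model
    messages=[{"role": "user", "content": "Hello, Arize!"}],
)
print(response.choices[0].message.content)
```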
litellm/integrations/braintrust_logging.py (new file, 369 lines)
@@ -0,0 +1,369 @@
# What is this?
## Log success + failure events to Braintrust

import copy
import json
import os
import threading
import traceback
import uuid
from typing import Literal, Optional

import dotenv
import httpx

import litellm
from litellm import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import get_formatted_prompt

global_braintrust_http_handler = AsyncHTTPHandler()
global_braintrust_sync_http_handler = HTTPHandler()
API_BASE = "https://api.braintrustdata.com/v1"


def get_utc_datetime():
    import datetime as dt
    from datetime import datetime

    if hasattr(dt, "UTC"):
        return datetime.now(dt.UTC)  # type: ignore
    else:
        return datetime.utcnow()  # type: ignore


class BraintrustLogger(CustomLogger):
    def __init__(
        self, api_key: Optional[str] = None, api_base: Optional[str] = None
    ) -> None:
        super().__init__()
        self.validate_environment(api_key=api_key)
        self.api_base = api_base or API_BASE
        self.default_project_id = None
        self.api_key: str = api_key or os.getenv("BRAINTRUST_API_KEY")  # type: ignore
        self.headers = {
            "Authorization": "Bearer " + self.api_key,
            "Content-Type": "application/json",
        }

    def validate_environment(self, api_key: Optional[str]):
        """
        Expects
        BRAINTRUST_API_KEY

        in the environment
        """
        missing_keys = []
        if api_key is None and os.getenv("BRAINTRUST_API_KEY", None) is None:
            missing_keys.append("BRAINTRUST_API_KEY")

        if len(missing_keys) > 0:
            raise Exception("Missing keys={} in environment.".format(missing_keys))

    @staticmethod
    def add_metadata_from_header(litellm_params: dict, metadata: dict) -> dict:
        """
        Adds metadata from proxy request headers to Langfuse logging if keys start with "langfuse_"
        and overwrites litellm_params.metadata if already included.

        For example if you want to append your trace to an existing `trace_id` via header, send
        `headers: { ..., langfuse_existing_trace_id: your-existing-trace-id }` via proxy request.
        """
        if litellm_params is None:
            return metadata

        if litellm_params.get("proxy_server_request") is None:
            return metadata

        if metadata is None:
            metadata = {}

        proxy_headers = (
            litellm_params.get("proxy_server_request", {}).get("headers", {}) or {}
        )

        for metadata_param_key in proxy_headers:
            if metadata_param_key.startswith("braintrust"):
                trace_param_key = metadata_param_key.replace("braintrust", "", 1)
                if trace_param_key in metadata:
                    verbose_logger.warning(
                        f"Overwriting Braintrust `{trace_param_key}` from request header"
                    )
                else:
                    verbose_logger.debug(
                        f"Found Braintrust `{trace_param_key}` in request header"
                    )
                metadata[trace_param_key] = proxy_headers.get(metadata_param_key)

        return metadata

    async def create_default_project_and_experiment(self):
        project = await global_braintrust_http_handler.post(
            f"{self.api_base}/project", headers=self.headers, json={"name": "litellm"}
        )

        project_dict = project.json()

        self.default_project_id = project_dict["id"]

    def create_sync_default_project_and_experiment(self):
        project = global_braintrust_sync_http_handler.post(
            f"{self.api_base}/project", headers=self.headers, json={"name": "litellm"}
        )

        project_dict = project.json()

        self.default_project_id = project_dict["id"]

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
        try:
            litellm_call_id = kwargs.get("litellm_call_id")
            project_id = kwargs.get("project_id", None)
            if project_id is None:
                if self.default_project_id is None:
                    self.create_sync_default_project_and_experiment()
                project_id = self.default_project_id

            prompt = {"messages": kwargs.get("messages")}

            if response_obj is not None and (
                kwargs.get("call_type", None) == "embedding"
                or isinstance(response_obj, litellm.EmbeddingResponse)
            ):
                input = prompt
                output = None
            elif response_obj is not None and isinstance(
                response_obj, litellm.ModelResponse
            ):
                input = prompt
                output = response_obj["choices"][0]["message"].json()
            elif response_obj is not None and isinstance(
                response_obj, litellm.TextCompletionResponse
            ):
                input = prompt
                output = response_obj.choices[0].text
            elif response_obj is not None and isinstance(
                response_obj, litellm.ImageResponse
            ):
                input = prompt
                output = response_obj["data"]

            litellm_params = kwargs.get("litellm_params", {})
            metadata = (
                litellm_params.get("metadata", {}) or {}
            )  # if litellm_params['metadata'] == None
            metadata = self.add_metadata_from_header(litellm_params, metadata)
            clean_metadata = {}
            try:
                metadata = copy.deepcopy(
                    metadata
                )  # Avoid modifying the original metadata
            except:
                new_metadata = {}
                for key, value in metadata.items():
                    if (
                        isinstance(value, list)
                        or isinstance(value, dict)
                        or isinstance(value, str)
                        or isinstance(value, int)
                        or isinstance(value, float)
                    ):
                        new_metadata[key] = copy.deepcopy(value)
                metadata = new_metadata

            tags = []
            if isinstance(metadata, dict):
                for key, value in metadata.items():
                    # generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy
                    if (
                        litellm._langfuse_default_tags is not None
                        and isinstance(litellm._langfuse_default_tags, list)
                        and key in litellm._langfuse_default_tags
                    ):
                        tags.append(f"{key}:{value}")

                    # clean litellm metadata before logging
                    if key in [
                        "headers",
                        "endpoint",
                        "caching_groups",
                        "previous_models",
                    ]:
                        continue
                    else:
                        clean_metadata[key] = value

            cost = kwargs.get("response_cost", None)
            if cost is not None:
                clean_metadata["litellm_response_cost"] = cost

            metrics: Optional[dict] = None
            if (
                response_obj is not None
                and hasattr(response_obj, "usage")
                and isinstance(response_obj.usage, litellm.Usage)
            ):
                generation_id = litellm.utils.get_logging_id(start_time, response_obj)
                metrics = {
                    "prompt_tokens": response_obj.usage.prompt_tokens,
                    "completion_tokens": response_obj.usage.completion_tokens,
                    "total_tokens": response_obj.usage.total_tokens,
                    "total_cost": cost,
                }

            request_data = {
                "id": litellm_call_id,
                "input": prompt,
                "output": output,
                "metadata": clean_metadata,
                "tags": tags,
            }
            if metrics is not None:
                request_data["metrics"] = metrics

            try:
                global_braintrust_sync_http_handler.post(
                    url=f"{self.api_base}/project_logs/{project_id}/insert",
                    json={"events": [request_data]},
                    headers=self.headers,
                )
            except httpx.HTTPStatusError as e:
                raise Exception(e.response.text)
        except Exception as e:
            verbose_logger.error(
                "Error logging to braintrust - Exception received - {}\n{}".format(
                    str(e), traceback.format_exc()
                )
            )
            raise e

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
        try:
            litellm_call_id = kwargs.get("litellm_call_id")
            project_id = kwargs.get("project_id", None)
            if project_id is None:
                if self.default_project_id is None:
                    await self.create_default_project_and_experiment()
                project_id = self.default_project_id

            prompt = {"messages": kwargs.get("messages")}

            if response_obj is not None and (
                kwargs.get("call_type", None) == "embedding"
                or isinstance(response_obj, litellm.EmbeddingResponse)
            ):
                input = prompt
                output = None
            elif response_obj is not None and isinstance(
                response_obj, litellm.ModelResponse
            ):
                input = prompt
                output = response_obj["choices"][0]["message"].json()
            elif response_obj is not None and isinstance(
                response_obj, litellm.TextCompletionResponse
            ):
                input = prompt
                output = response_obj.choices[0].text
            elif response_obj is not None and isinstance(
                response_obj, litellm.ImageResponse
            ):
                input = prompt
                output = response_obj["data"]

            litellm_params = kwargs.get("litellm_params", {})
            metadata = (
                litellm_params.get("metadata", {}) or {}
            )  # if litellm_params['metadata'] == None
            metadata = self.add_metadata_from_header(litellm_params, metadata)
            clean_metadata = {}
            try:
                metadata = copy.deepcopy(
                    metadata
                )  # Avoid modifying the original metadata
            except:
                new_metadata = {}
                for key, value in metadata.items():
                    if (
                        isinstance(value, list)
                        or isinstance(value, dict)
                        or isinstance(value, str)
                        or isinstance(value, int)
                        or isinstance(value, float)
                    ):
                        new_metadata[key] = copy.deepcopy(value)
                metadata = new_metadata

            tags = []
            if isinstance(metadata, dict):
                for key, value in metadata.items():
                    # generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy
                    if (
                        litellm._langfuse_default_tags is not None
                        and isinstance(litellm._langfuse_default_tags, list)
                        and key in litellm._langfuse_default_tags
                    ):
                        tags.append(f"{key}:{value}")

                    # clean litellm metadata before logging
                    if key in [
                        "headers",
                        "endpoint",
                        "caching_groups",
                        "previous_models",
                    ]:
                        continue
                    else:
                        clean_metadata[key] = value

            cost = kwargs.get("response_cost", None)
            if cost is not None:
                clean_metadata["litellm_response_cost"] = cost

            metrics: Optional[dict] = None
            if (
                response_obj is not None
                and hasattr(response_obj, "usage")
                and isinstance(response_obj.usage, litellm.Usage)
            ):
                generation_id = litellm.utils.get_logging_id(start_time, response_obj)
                metrics = {
                    "prompt_tokens": response_obj.usage.prompt_tokens,
                    "completion_tokens": response_obj.usage.completion_tokens,
                    "total_tokens": response_obj.usage.total_tokens,
                    "total_cost": cost,
                }

            request_data = {
                "id": litellm_call_id,
                "input": prompt,
                "output": output,
                "metadata": clean_metadata,
                "tags": tags,
            }

            if metrics is not None:
                request_data["metrics"] = metrics

            try:
                await global_braintrust_http_handler.post(
                    url=f"{self.api_base}/project_logs/{project_id}/insert",
                    json={"events": [request_data]},
                    headers=self.headers,
                )
            except httpx.HTTPStatusError as e:
                raise Exception(e.response.text)
        except Exception as e:
            verbose_logger.error(
                "Error logging to braintrust - Exception received - {}\n{}".format(
                    str(e), traceback.format_exc()
                )
            )
            raise e

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        return super().log_failure_event(kwargs, response_obj, start_time, end_time)
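Based on the "braintrust" callback entry and the BRAINTRUST_API_KEY check added here, a minimal SDK-side sketch for sending logs to Braintrust (key and model are placeholders):

```python
import os

import litellm

os.environ["BRAINTRUST_API_KEY"] = "my-braintrust-api-key"  # placeholder

# registers BraintrustLogger for success/failure events
litellm.callbacks = ["braintrust"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is LiteLLM?"}],
)
```

When no project id is supplied, the logger lazily creates (or reuses) a default Braintrust project named `litellm`, per `create_default_project_and_experiment` above; proxy request headers prefixed with `braintrust` are merged into the logged metadata by `add_metadata_from_header`.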
@@ -8,6 +8,7 @@ from packaging.version import Version
 
 import litellm
 from litellm._logging import verbose_logger
+from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
 
 
 class LangFuseLogger:
@@ -382,6 +383,8 @@ class LangFuseLogger:
         mask_input = clean_metadata.pop("mask_input", False)
         mask_output = clean_metadata.pop("mask_output", False)
 
+        clean_metadata = redact_user_api_key_info(metadata=clean_metadata)
+
         if trace_name is None and existing_trace_id is None:
             # just log `litellm-{call_type}` as the trace name
             ## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
@@ -79,6 +79,7 @@ class LangsmithLogger(CustomLogger):
         project_name = metadata.get("project_name", self.langsmith_project)
         run_name = metadata.get("run_name", self.langsmith_default_run_name)
         run_id = metadata.get("id", None)
+        tags = metadata.get("tags", []) or []
         verbose_logger.debug(
             f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
         )
@@ -122,6 +123,7 @@ class LangsmithLogger(CustomLogger):
             "session_name": project_name,
             "start_time": start_time,
             "end_time": end_time,
+            "tags": tags,
         }
 
         if run_id:
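The Langsmith logger now forwards `tags` taken from request metadata. A hedged sketch of how a caller might supply them; the exact plumbing of the `metadata` argument can vary by entrypoint, and the values below are made up:

```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    metadata={
        "run_name": "checkout-flow-test",   # hypothetical run name
        "tags": ["prod", "team-payments"],  # picked up via metadata.get("tags", []) above
    },
)
```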
@@ -1,17 +1,21 @@
 #### What this does ####
 # On success + failure, log events to Logfire
 
-import dotenv, os
+import os
+
+import dotenv
 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
 import uuid
-from litellm._logging import print_verbose, verbose_logger
 
 from enum import Enum
 from typing import Any, Dict, NamedTuple
 
 from typing_extensions import LiteralString
 
+from litellm._logging import print_verbose, verbose_logger
+from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
+
 
 class SpanConfig(NamedTuple):
     message_template: LiteralString
@@ -135,6 +139,8 @@ class LogfireLogger:
             else:
                 clean_metadata[key] = value
 
+        clean_metadata = redact_user_api_key_info(metadata=clean_metadata)
+
         # Build the initial payload
         payload = {
             "id": id,
@@ -2,11 +2,12 @@ import os
 from dataclasses import dataclass
 from datetime import datetime
 from functools import wraps
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 
 import litellm
 from litellm._logging import verbose_logger
 from litellm.integrations.custom_logger import CustomLogger
+from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
 from litellm.types.services import ServiceLoggerPayload
 
 if TYPE_CHECKING:
@@ -27,9 +28,10 @@ else:
 
 
 LITELLM_TRACER_NAME = os.getenv("OTEL_TRACER_NAME", "litellm")
-LITELLM_RESOURCE = {
+LITELLM_RESOURCE: Dict[Any, Any] = {
     "service.name": os.getenv("OTEL_SERVICE_NAME", "litellm"),
     "deployment.environment": os.getenv("OTEL_ENVIRONMENT_NAME", "production"),
+    "model_id": os.getenv("OTEL_SERVICE_NAME", "litellm"),
 }
 RAW_REQUEST_SPAN_NAME = "raw_gen_ai_request"
 LITELLM_REQUEST_SPAN_NAME = "litellm_request"
@@ -68,7 +70,9 @@ class OpenTelemetryConfig:
 
 
 class OpenTelemetry(CustomLogger):
-    def __init__(self, config=OpenTelemetryConfig.from_env()):
+    def __init__(
+        self, config=OpenTelemetryConfig.from_env(), callback_name: Optional[str] = None
+    ):
         from opentelemetry import trace
         from opentelemetry.sdk.resources import Resource
         from opentelemetry.sdk.trace import TracerProvider
@@ -79,6 +83,7 @@ class OpenTelemetry(CustomLogger):
         self.OTEL_HEADERS = self.config.headers
         provider = TracerProvider(resource=Resource(attributes=LITELLM_RESOURCE))
         provider.add_span_processor(self._get_span_processor())
+        self.callback_name = callback_name
 
         trace.set_tracer_provider(provider)
         self.tracer = trace.get_tracer(LITELLM_TRACER_NAME)
@@ -120,8 +125,8 @@ class OpenTelemetry(CustomLogger):
         from opentelemetry import trace
         from opentelemetry.trace import Status, StatusCode
 
-        _start_time_ns = start_time
-        _end_time_ns = end_time
+        _start_time_ns = 0
+        _end_time_ns = 0
 
         if isinstance(start_time, float):
             _start_time_ns = int(int(start_time) * 1e9)
@@ -159,8 +164,8 @@ class OpenTelemetry(CustomLogger):
         from opentelemetry import trace
         from opentelemetry.trace import Status, StatusCode
 
-        _start_time_ns = start_time
-        _end_time_ns = end_time
+        _start_time_ns = 0
+        _end_time_ns = 0
 
         if isinstance(start_time, float):
             _start_time_ns = int(int(start_time) * 1e9)
@@ -294,6 +299,11 @@ class OpenTelemetry(CustomLogger):
         return isinstance(value, (str, bool, int, float))
 
     def set_attributes(self, span: Span, kwargs, response_obj):
+        if self.callback_name == "arize":
+            from litellm.integrations.arize_ai import set_arize_ai_attributes
+
+            set_arize_ai_attributes(span, kwargs, response_obj)
+            return
         from litellm.proxy._types import SpanAttributes
 
         optional_params = kwargs.get("optional_params", {})
@@ -306,7 +316,9 @@ class OpenTelemetry(CustomLogger):
         #############################################
         metadata = litellm_params.get("metadata", {}) or {}
 
-        for key, value in metadata.items():
+        clean_metadata = redact_user_api_key_info(metadata=metadata)
+
+        for key, value in clean_metadata.items():
             if self.is_primitive(value):
                 span.set_attribute("metadata.{}".format(key), value)
 
@@ -612,8 +624,8 @@ class OpenTelemetry(CustomLogger):
         from opentelemetry import trace
         from opentelemetry.trace import Status, StatusCode
 
-        _start_time_ns = logging_payload.start_time
-        _end_time_ns = logging_payload.end_time
+        _start_time_ns = 0
+        _end_time_ns = 0
 
         start_time = logging_payload.start_time
         end_time = logging_payload.end_time
@@ -658,8 +670,8 @@ class OpenTelemetry(CustomLogger):
         from opentelemetry import trace
         from opentelemetry.trace import Status, StatusCode
 
-        _start_time_ns = logging_payload.start_time
-        _end_time_ns = logging_payload.end_time
+        _start_time_ns = 0
+        _end_time_ns = 0
 
         start_time = logging_payload.start_time
         end_time = logging_payload.end_time
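The new `callback_name` parameter is what lets the shared OpenTelemetry logger switch to Arize-specific span attributes. Constructing that variant directly, mirroring the initialization code added in `litellm_logging.py` below (an OTLP-capable environment is assumed):

```python
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig

# same exporter/endpoint used by the "arize" branch of _init_custom_logger_compatible_class
otel_config = OpenTelemetryConfig(
    exporter="otlp_grpc",
    endpoint="https://otlp.arize.com/v1",
)
arize_otel_logger = OpenTelemetry(config=otel_config, callback_name="arize")
```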
@@ -53,6 +53,7 @@ from litellm.utils import (
 from ..integrations.aispend import AISpendLogger
 from ..integrations.athina import AthinaLogger
 from ..integrations.berrispend import BerriSpendLogger
+from ..integrations.braintrust_logging import BraintrustLogger
 from ..integrations.clickhouse import ClickhouseLogger
 from ..integrations.custom_logger import CustomLogger
 from ..integrations.datadog import DataDogLogger
@@ -1945,7 +1946,14 @@ def _init_custom_logger_compatible_class(
         _openmeter_logger = OpenMeterLogger()
         _in_memory_loggers.append(_openmeter_logger)
         return _openmeter_logger  # type: ignore
+    elif logging_integration == "braintrust":
+        for callback in _in_memory_loggers:
+            if isinstance(callback, BraintrustLogger):
+                return callback  # type: ignore
+
+        braintrust_logger = BraintrustLogger()
+        _in_memory_loggers.append(braintrust_logger)
+        return braintrust_logger  # type: ignore
     elif logging_integration == "langsmith":
         for callback in _in_memory_loggers:
             if isinstance(callback, LangsmithLogger):
@@ -1954,6 +1962,43 @@ def _init_custom_logger_compatible_class(
         _langsmith_logger = LangsmithLogger()
         _in_memory_loggers.append(_langsmith_logger)
         return _langsmith_logger  # type: ignore
+    elif logging_integration == "arize":
+        if "ARIZE_SPACE_KEY" not in os.environ:
+            raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
+        if "ARIZE_API_KEY" not in os.environ:
+            raise ValueError("ARIZE_API_KEY not found in environment variables")
+        from litellm.integrations.opentelemetry import (
+            OpenTelemetry,
+            OpenTelemetryConfig,
+        )
+
+        otel_config = OpenTelemetryConfig(
+            exporter="otlp_grpc",
+            endpoint="https://otlp.arize.com/v1",
+        )
+        os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
+            f"space_key={os.getenv('ARIZE_SPACE_KEY')},api_key={os.getenv('ARIZE_API_KEY')}"
+        )
+        for callback in _in_memory_loggers:
+            if (
+                isinstance(callback, OpenTelemetry)
+                and callback.callback_name == "arize"
+            ):
+                return callback  # type: ignore
+        _otel_logger = OpenTelemetry(config=otel_config, callback_name="arize")
+        _in_memory_loggers.append(_otel_logger)
+        return _otel_logger  # type: ignore
+
+    elif logging_integration == "otel":
+        from litellm.integrations.opentelemetry import OpenTelemetry
+
+        for callback in _in_memory_loggers:
+            if isinstance(callback, OpenTelemetry):
+                return callback  # type: ignore
+
+        otel_logger = OpenTelemetry()
+        _in_memory_loggers.append(otel_logger)
+        return otel_logger  # type: ignore
+
     elif logging_integration == "galileo":
         for callback in _in_memory_loggers:
@@ -2019,6 +2064,10 @@ def get_custom_logger_compatible_class(
         for callback in _in_memory_loggers:
             if isinstance(callback, OpenMeterLogger):
                 return callback
+    elif logging_integration == "braintrust":
+        for callback in _in_memory_loggers:
+            if isinstance(callback, BraintrustLogger):
+                return callback
     elif logging_integration == "galileo":
         for callback in _in_memory_loggers:
             if isinstance(callback, GalileoObserve):
@@ -2027,6 +2076,25 @@ def get_custom_logger_compatible_class(
         for callback in _in_memory_loggers:
             if isinstance(callback, LangsmithLogger):
                 return callback
+    elif logging_integration == "otel":
+        from litellm.integrations.opentelemetry import OpenTelemetry
+
+        for callback in _in_memory_loggers:
+            if isinstance(callback, OpenTelemetry):
+                return callback
+    elif logging_integration == "arize":
+        from litellm.integrations.opentelemetry import OpenTelemetry
+
+        if "ARIZE_SPACE_KEY" not in os.environ:
+            raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
+        if "ARIZE_API_KEY" not in os.environ:
+            raise ValueError("ARIZE_API_KEY not found in environment variables")
+        for callback in _in_memory_loggers:
+            if (
+                isinstance(callback, OpenTelemetry)
+                and callback.callback_name == "arize"
+            ):
+                return callback
     elif logging_integration == "logfire":
         if "LOGFIRE_TOKEN" not in os.environ:
             raise ValueError("LOGFIRE_TOKEN not found in environment variables")
@@ -87,3 +87,33 @@ def redact_message_input_output_from_logging(
 
     # by default return result
     return result
+
+
+def redact_user_api_key_info(metadata: dict) -> dict:
+    """
+    removes any user_api_key_info before passing to logging object, if flag set
+
+    Usage:
+
+    SDK
+    ```python
+    litellm.redact_user_api_key_info = True
+    ```
+
+    PROXY:
+    ```yaml
+    litellm_settings:
+        redact_user_api_key_info: true
+    ```
+    """
+    if litellm.redact_user_api_key_info is not True:
+        return metadata
+
+    new_metadata = {}
+    for k, v in metadata.items():
+        if isinstance(k, str) and k.startswith("user_api_key"):
+            pass
+        else:
+            new_metadata[k] = v
+
+    return new_metadata
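A quick sketch of the new helper's behaviour. The metadata keys below are examples, but any key beginning with `user_api_key` is dropped once the flag is enabled:

```python
import litellm
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info

litellm.redact_user_api_key_info = True

metadata = {
    "user_api_key_hash": "88dc28...",      # example values
    "user_api_key_alias": "team-a-key",
    "requester_ip_address": "10.0.0.12",
}
print(redact_user_api_key_info(metadata=metadata))
# -> {'requester_ip_address': '10.0.0.12'}
```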
@@ -385,6 +385,11 @@ class AnthropicConfig:
         if "user_id" in anthropic_message_request["metadata"]:
             new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
 
+        # Pass litellm proxy specific metadata
+        if "litellm_metadata" in anthropic_message_request:
+            # metadata will be passed to litellm.acompletion(), it's a litellm_param
+            new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata")
+
         ## CONVERT TOOL CHOICE
         if "tool_choice" in anthropic_message_request:
             new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
@@ -775,8 +780,17 @@ class AnthropicChatCompletion(BaseLLM):
         system_prompt = ""
         for idx, message in enumerate(messages):
             if message["role"] == "system":
-                system_prompt += message["content"]
-                system_prompt_indices.append(idx)
+                valid_content: bool = False
+                if isinstance(message["content"], str):
+                    system_prompt += message["content"]
+                    valid_content = True
+                elif isinstance(message["content"], list):
+                    for content in message["content"]:
+                        system_prompt += content.get("text", "")
+                        valid_content = True
+
+                if valid_content:
+                    system_prompt_indices.append(idx)
         if len(system_prompt_indices) > 0:
             for idx in reversed(system_prompt_indices):
                 messages.pop(idx)
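The updated system-prompt handling accepts both string content and a list of content blocks. A standalone sketch of the same extraction logic, using a hypothetical message list:

```python
def extract_system_prompt(messages: list) -> str:
    # mirrors the updated handling: str content and list-of-text-blocks are both accepted
    system_prompt = ""
    for message in messages:
        if message["role"] != "system":
            continue
        content = message["content"]
        if isinstance(content, str):
            system_prompt += content
        elif isinstance(content, list):
            for block in content:
                system_prompt += block.get("text", "")
    return system_prompt


messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are terse."}]},
    {"role": "user", "content": "Hello"},
]
print(extract_system_prompt(messages))  # "You are terse."
```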
@@ -76,6 +76,8 @@ BEDROCK_CONVERSE_MODELS = [
     "anthropic.claude-v1",
     "anthropic.claude-instant-v1",
     "ai21.jamba-instruct-v1:0",
+    "meta.llama3-1-8b-instruct-v1:0",
+    "meta.llama3-1-70b-instruct-v1:0",
 ]
 
 
@@ -1729,7 +1731,7 @@ class BedrockConverseLLM(BaseLLM):
         headers={},
         client: Optional[AsyncHTTPHandler] = None,
     ) -> Union[ModelResponse, CustomStreamWrapper]:
-        if client is None:
+        if client is None or not isinstance(client, AsyncHTTPHandler):
             _params = {}
             if timeout is not None:
                 if isinstance(timeout, float) or isinstance(timeout, int):
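With `meta.llama3-1-8b-instruct-v1:0` and `meta.llama3-1-70b-instruct-v1:0` added to `BEDROCK_CONVERSE_MODELS`, calling them should follow the usual `bedrock/<model-id>` convention; AWS credentials and region are assumed to already be configured:

```python
import litellm

response = litellm.completion(
    model="bedrock/meta.llama3-1-8b-instruct-v1:0",  # routed through the Bedrock Converse API
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```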
@@ -968,7 +968,7 @@ class OpenAIChatCompletion(BaseLLM):
         except openai.UnprocessableEntityError as e:
             ## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
             if litellm.drop_params is True or drop_params is True:
-                if e.body is not None and e.body.get("detail"):  # type: ignore
+                if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"):  # type: ignore
                     detail = e.body.get("detail")  # type: ignore
                     invalid_params: List[str] = []
                     if (
@@ -1100,7 +1100,7 @@ class OpenAIChatCompletion(BaseLLM):
         except openai.UnprocessableEntityError as e:
             ## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
             if litellm.drop_params is True or drop_params is True:
-                if e.body is not None and e.body.get("detail"):  # type: ignore
+                if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"):  # type: ignore
                     detail = e.body.get("detail")  # type: ignore
                     invalid_params: List[str] = []
                     if (
@@ -1231,7 +1231,7 @@ class OpenAIChatCompletion(BaseLLM):
         except openai.UnprocessableEntityError as e:
             ## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
             if litellm.drop_params is True or drop_params is True:
-                if e.body is not None and e.body.get("detail"):  # type: ignore
+                if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"):  # type: ignore
                     detail = e.body.get("detail")  # type: ignore
                     invalid_params: List[str] = []
                     if (
@@ -1,23 +1,31 @@
-import copy
 import json
 import os
 import time
-import types
 from enum import Enum
-from typing import Callable, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union

 import httpx  # type: ignore
 import requests  # type: ignore

 import litellm
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+from litellm.utils import (
+    Choices,
+    CustomStreamWrapper,
+    Delta,
+    EmbeddingResponse,
+    Message,
+    ModelResponse,
+    Usage,
+    map_finish_reason,
+)

 from .base import BaseLLM
 from .prompt_templates.factory import custom_prompt, prompt_factory


 class TritonError(Exception):
-    def __init__(self, status_code, message):
+    def __init__(self, status_code: int, message: str) -> None:
         self.status_code = status_code
         self.message = message
         self.request = httpx.Request(
@@ -41,8 +49,7 @@ class TritonChatCompletion(BaseLLM):
         api_base: str,
         logging_obj=None,
         api_key: Optional[str] = None,
-    ):
-
+    ) -> EmbeddingResponse:
         async_handler = AsyncHTTPHandler(
             timeout=httpx.Timeout(timeout=600.0, connect=5.0)
         )

@@ -79,10 +86,10 @@ class TritonChatCompletion(BaseLLM):

         return model_response

-    def embedding(
+    async def embedding(
         self,
         model: str,
-        input: list,
+        input: List[str],
         timeout: float,
         api_base: str,
         model_response: litellm.utils.EmbeddingResponse,

@@ -90,8 +97,8 @@ class TritonChatCompletion(BaseLLM):
         logging_obj=None,
         optional_params=None,
         client=None,
-        aembedding=None,
-    ):
+        aembedding: bool = False,
+    ) -> EmbeddingResponse:
         data_for_triton = {
             "inputs": [
                 {

@@ -103,8 +110,6 @@ class TritonChatCompletion(BaseLLM):
             ]
         }

-        ## LOGGING
-
         curl_string = f"curl {api_base} -X POST -H 'Content-Type: application/json' -d '{data_for_triton}'"

         logging_obj.pre_call(

@@ -116,8 +121,8 @@ class TritonChatCompletion(BaseLLM):
             },
         )

-        if aembedding == True:
-            response = self.aembedding(
+        if aembedding:
+            response = await self.aembedding(
                 data=data_for_triton,
                 model_response=model_response,
                 logging_obj=logging_obj,
@@ -130,6 +135,198 @@ class TritonChatCompletion(BaseLLM):
                 "Only async embedding supported for triton, please use litellm.aembedding() for now"
             )

+    def completion(
+        self,
+        model: str,
+        messages: List[dict],
+        timeout: float,
+        api_base: str,
+        model_response: ModelResponse,
+        api_key: Optional[str] = None,
+        logging_obj=None,
+        optional_params=None,
+        client=None,
+        stream: Optional[bool] = False,
+        acompletion: bool = False,
+    ) -> ModelResponse:
+        type_of_model = ""
+        optional_params.pop("stream", False)
+        if api_base.endswith("generate"):  ### This is a trtllm model
+            text_input = messages[0]["content"]
+            data_for_triton: Dict[str, Any] = {
+                "text_input": prompt_factory(model=model, messages=messages),
+                "parameters": {
+                    "max_tokens": int(optional_params.get("max_tokens", 2000)),
+                    "bad_words": [""],
+                    "stop_words": [""],
+                },
+                "stream": bool(stream),
+            }
+            data_for_triton["parameters"].update(optional_params)
+            type_of_model = "trtllm"
+
+        elif api_base.endswith(
+            "infer"
+        ):  ### This is an infer model with a custom model on triton
+            text_input = messages[0]["content"]
+            data_for_triton = {
+                "inputs": [
+                    {
+                        "name": "text_input",
+                        "shape": [1],
+                        "datatype": "BYTES",
+                        "data": [text_input],
+                    }
+                ]
+            }
+
+            for k, v in optional_params.items():
+                if not (k == "stream" or k == "max_retries"):
+                    datatype = "INT32" if isinstance(v, int) else "BYTES"
+                    datatype = "FP32" if isinstance(v, float) else datatype
+                    data_for_triton["inputs"].append(
+                        {"name": k, "shape": [1], "datatype": datatype, "data": [v]}
+                    )
+
+            if "max_tokens" not in optional_params:
+                data_for_triton["inputs"].append(
+                    {
+                        "name": "max_tokens",
+                        "shape": [1],
+                        "datatype": "INT32",
+                        "data": [20],
+                    }
+                )
+
+            type_of_model = "infer"
+        else:  ## Unknown model type passthrough
+            data_for_triton = {
+                "inputs": [
+                    {
+                        "name": "text_input",
+                        "shape": [1],
+                        "datatype": "BYTES",
+                        "data": [messages[0]["content"]],
+                    }
+                ]
+            }
+
+        if logging_obj:
+            logging_obj.pre_call(
+                input=messages,
+                api_key=api_key,
+                additional_args={
+                    "complete_input_dict": optional_params,
+                    "api_base": api_base,
+                    "http_client": client,
+                },
+            )
+
+        headers = {"Content-Type": "application/json"}
+        json_data_for_triton: str = json.dumps(data_for_triton)
+
+        if acompletion:
+            return self.acompletion(  # type: ignore
+                model,
+                json_data_for_triton,
+                headers=headers,
+                logging_obj=logging_obj,
+                api_base=api_base,
+                stream=stream,
+                model_response=model_response,
+                type_of_model=type_of_model,
+            )
+        else:
+            handler = HTTPHandler()
+            if stream:
+                return self._handle_stream(
+                    handler, api_base, data_for_triton, model, logging_obj
+                )
+            else:
+                response = handler.post(url=api_base, data=data_for_triton, headers=headers)
+                return self._handle_response(
+                    response, model_response, logging_obj, type_of_model=type_of_model
+                )
+
+    async def acompletion(
+        self,
+        model: str,
+        data_for_triton,
+        api_base,
+        stream,
+        logging_obj,
+        headers,
+        model_response,
+        type_of_model,
+    ) -> ModelResponse:
+        handler = AsyncHTTPHandler()
+        if stream:
+            return self._ahandle_stream(
+                handler, api_base, data_for_triton, model, logging_obj
+            )
+        else:
+            response = await handler.post(
+                url=api_base, data=data_for_triton, headers=headers
+            )
+
+            return self._handle_response(
+                response, model_response, logging_obj, type_of_model=type_of_model
+            )
+
+    def _handle_stream(self, handler, api_base, data_for_triton, model, logging_obj):
+        response = handler.post(
+            url=api_base + "_stream", data=data_for_triton, stream=True
+        )
+        streamwrapper = litellm.CustomStreamWrapper(
+            response.iter_lines(),
+            model=model,
+            custom_llm_provider="triton",
+            logging_obj=logging_obj,
+        )
+        for chunk in streamwrapper:
+            yield (chunk)
+
+    async def _ahandle_stream(
+        self, handler, api_base, data_for_triton, model, logging_obj
+    ):
+        response = await handler.post(
+            url=api_base + "_stream", data=data_for_triton, stream=True
+        )
+        streamwrapper = litellm.CustomStreamWrapper(
+            response.aiter_lines(),
+            model=model,
+            custom_llm_provider="triton",
+            logging_obj=logging_obj,
+        )
+        async for chunk in streamwrapper:
+            yield (chunk)
+
+    def _handle_response(self, response, model_response, logging_obj, type_of_model):
+        if logging_obj:
+            logging_obj.post_call(original_response=response)
+
+        if response.status_code != 200:
+            raise TritonError(status_code=response.status_code, message=response.text)
+
+        _json_response = response.json()
+        model_response.model = _json_response.get("model_name", "None")
+        if type_of_model == "trtllm":
+            model_response.choices = [
+                Choices(index=0, message=Message(content=_json_response["text_output"]))
+            ]
+        elif type_of_model == "infer":
+            model_response.choices = [
+                Choices(
+                    index=0,
+                    message=Message(content=_json_response["outputs"][0]["data"]),
+                )
+            ]
+        else:
+            model_response.choices = [
+                Choices(index=0, message=Message(content=_json_response["outputs"]))
+            ]
+        return model_response
+
     @staticmethod
     def split_embedding_by_shape(
         data: List[float], shape: List[int]
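For context, a rough sketch of how this new Triton completion path is meant to be exercised from the SDK. The model name and Triton endpoint URL are hypothetical, and the `triton/` prefix is assumed to resolve to the provider branch added in `main.py` later in this commit.

```python
import litellm

# Hypothetical Triton server exposing a TensorRT-LLM "generate" endpoint.
response = litellm.completion(
    model="triton/llama-3-8b",
    messages=[{"role": "user", "content": "Say hello"}],
    api_base="http://localhost:8000/v2/models/llama-3-8b/generate",
)
print(response.choices[0].message.content)
```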
litellm/llms/vertex_ai_llama.py (new file, 203 lines)
@@ -0,0 +1,203 @@
# What is this?
## Handler for calling llama 3.1 API on Vertex AI
import copy
import json
import os
import time
import types
import uuid
from enum import Enum
from typing import Any, Callable, List, Optional, Tuple, Union

import httpx  # type: ignore
import requests  # type: ignore

import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.anthropic import (
    AnthropicMessagesTool,
    AnthropicMessagesToolChoice,
)
from litellm.types.llms.openai import (
    ChatCompletionToolParam,
    ChatCompletionToolParamFunctionChunk,
)
from litellm.types.utils import ResponseFormatChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

from .base import BaseLLM
from .prompt_templates.factory import (
    construct_tool_use_system_prompt,
    contains_tag,
    custom_prompt,
    extract_between_tags,
    parse_xml_params,
    prompt_factory,
    response_schema_prompt,
)


class VertexAIError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        self.request = httpx.Request(
            method="POST", url=" https://cloud.google.com/vertex-ai/"
        )
        self.response = httpx.Response(status_code=status_code, request=self.request)
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class VertexAILlama3Config:
    """
    Reference:https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama#streaming

    The class `VertexAILlama3Config` provides configuration for the VertexAI's Llama API interface. Below are the parameters:

    - `max_tokens` Required (integer) max tokens,

    Note: Please make sure to modify the default parameters as required for your use case.
    """

    max_tokens: Optional[int] = None

    def __init__(
        self,
        max_tokens: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key == "max_tokens" and value is None:
                value = self.max_tokens
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }

    def get_supported_openai_params(self):
        return [
            "max_tokens",
            "stream",
        ]

    def map_openai_params(self, non_default_params: dict, optional_params: dict):
        for param, value in non_default_params.items():
            if param == "max_tokens":
                optional_params["max_tokens"] = value

        return optional_params


class VertexAILlama3(BaseLLM):
    def __init__(self) -> None:
        pass

    def create_vertex_llama3_url(
        self, vertex_location: str, vertex_project: str
    ) -> str:
        return f"https://{vertex_location}-aiplatform.googleapis.com/v1beta1/projects/{vertex_project}/locations/{vertex_location}/endpoints/openapi"

    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding,
        logging_obj,
        optional_params: dict,
        custom_prompt_dict: dict,
        headers: Optional[dict],
        timeout: Union[float, httpx.Timeout],
        vertex_project=None,
        vertex_location=None,
        vertex_credentials=None,
        litellm_params=None,
        logger_fn=None,
        acompletion: bool = False,
        client=None,
    ):
        try:
            import vertexai
            from google.cloud import aiplatform

            from litellm.llms.openai import OpenAIChatCompletion
            from litellm.llms.vertex_httpx import VertexLLM
        except Exception:

            raise VertexAIError(
                status_code=400,
                message="""vertexai import failed please run `pip install -U "google-cloud-aiplatform>=1.38"`""",
            )

        if not (
            hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
        ):
            raise VertexAIError(
                status_code=400,
                message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
            )
        try:

            vertex_httpx_logic = VertexLLM()

            access_token, project_id = vertex_httpx_logic._ensure_access_token(
                credentials=vertex_credentials, project_id=vertex_project
            )

            openai_chat_completions = OpenAIChatCompletion()

            ## Load Config
            # config = litellm.VertexAILlama3.get_config()
            # for k, v in config.items():
            #     if k not in optional_params:
            #         optional_params[k] = v

            ## CONSTRUCT API BASE
            stream: bool = optional_params.get("stream", False) or False

            optional_params["stream"] = stream

            api_base = self.create_vertex_llama3_url(
                vertex_location=vertex_location or "us-central1",
                vertex_project=vertex_project or project_id,
            )

            return openai_chat_completions.completion(
                model=model,
                messages=messages,
                api_base=api_base,
                api_key=access_token,
                custom_prompt_dict=custom_prompt_dict,
                model_response=model_response,
                print_verbose=print_verbose,
                logging_obj=logging_obj,
                optional_params=optional_params,
                acompletion=acompletion,
                litellm_params=litellm_params,
                logger_fn=logger_fn,
                client=client,
                timeout=timeout,
            )

        except Exception as e:
            raise VertexAIError(status_code=500, message=str(e))
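A hedged usage sketch for the handler above. The project id is a placeholder, and the `meta/` model prefix is what routes a Vertex AI request into `VertexAILlama3.completion()` (see the `main.py` hunk further down).

```python
import litellm

litellm.vertex_project = "my-gcp-project"   # placeholder
litellm.vertex_location = "us-central1"

response = litellm.completion(
    model="vertex_ai/meta/llama3-405b-instruct-maas",
    messages=[{"role": "user", "content": "Hello from Vertex AI"}],
)
```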
@@ -1033,7 +1033,7 @@ class VertexLLM(BaseLLM):
                 model=model, custom_llm_provider=_custom_llm_provider
             )
         except Exception as e:
-            verbose_logger.error(
+            verbose_logger.warning(
                 "Unable to identify if system message supported. Defaulting to 'False'. Received error message - {}\nAdd it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json".format(
                     str(e)
                 )
@@ -1189,7 +1189,7 @@ class VertexLLM(BaseLLM):
             response.raise_for_status()
         except httpx.HTTPStatusError as err:
             error_code = err.response.status_code
-            raise VertexAIError(status_code=error_code, message=response.text)
+            raise VertexAIError(status_code=error_code, message=err.response.text)
         except httpx.TimeoutException:
             raise VertexAIError(status_code=408, message="Timeout error occurred.")
@@ -120,6 +120,7 @@ from .llms.prompt_templates.factory import (
 )
 from .llms.text_completion_codestral import CodestralTextCompletion
 from .llms.triton import TritonChatCompletion
+from .llms.vertex_ai_llama import VertexAILlama3
 from .llms.vertex_httpx import VertexLLM
 from .llms.watsonx import IBMWatsonXAI
 from .types.llms.openai import HttpxBinaryResponseContent

@@ -156,6 +157,7 @@ triton_chat_completions = TritonChatCompletion()
 bedrock_chat_completion = BedrockLLM()
 bedrock_converse_chat_completion = BedrockConverseLLM()
 vertex_chat_completion = VertexLLM()
+vertex_llama_chat_completion = VertexAILlama3()
 watsonxai = IBMWatsonXAI()
 ####### COMPLETION ENDPOINTS ################

@@ -375,6 +377,7 @@ async def acompletion(
         or custom_llm_provider == "predibase"
         or custom_llm_provider == "bedrock"
         or custom_llm_provider == "databricks"
+        or custom_llm_provider == "triton"
         or custom_llm_provider == "clarifai"
         or custom_llm_provider == "watsonx"
         or custom_llm_provider in litellm.openai_compatible_providers
@@ -1491,6 +1494,10 @@ def completion(
             or get_secret("ANTHROPIC_BASE_URL")
             or "https://api.anthropic.com/v1/complete"
         )
+
+        if api_base is not None and not api_base.endswith("/v1/complete"):
+            api_base += "/v1/complete"
+
         response = anthropic_text_completions.completion(
             model=model,
             messages=messages,

@@ -1517,6 +1524,10 @@ def completion(
             or get_secret("ANTHROPIC_BASE_URL")
             or "https://api.anthropic.com/v1/messages"
         )
+
+        if api_base is not None and not api_base.endswith("/v1/messages"):
+            api_base += "/v1/messages"
+
         response = anthropic_chat_completions.completion(
             model=model,
             messages=messages,
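In effect, a caller can now point `api_base` at a bare Anthropic-compatible host and the right suffix is appended. A minimal sketch, with a hypothetical gateway URL:

```python
import litellm

response = litellm.completion(
    model="claude-3-5-sonnet-20240620",
    messages=[{"role": "user", "content": "ping"}],
    api_base="https://my-anthropic-gateway.example.com",  # "/v1/messages" is appended if missing
)
```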
@@ -2055,7 +2066,26 @@ def completion(
                 timeout=timeout,
                 client=client,
             )
+        elif model.startswith("meta/"):
+            model_response = vertex_llama_chat_completion.completion(
+                model=model,
+                messages=messages,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=new_params,
+                litellm_params=litellm_params,
+                logger_fn=logger_fn,
+                encoding=encoding,
+                vertex_location=vertex_ai_location,
+                vertex_project=vertex_ai_project,
+                vertex_credentials=vertex_credentials,
+                logging_obj=logging,
+                acompletion=acompletion,
+                headers=headers,
+                custom_prompt_dict=custom_prompt_dict,
+                timeout=timeout,
+                client=client,
+            )
         else:
             model_response = vertex_ai.completion(
                 model=model,
@@ -2469,6 +2499,25 @@ def completion(
                 return generator

             response = generator
+
+        elif custom_llm_provider == "triton":
+            api_base = litellm.api_base or api_base
+            model_response = triton_chat_completions.completion(
+                api_base=api_base,
+                timeout=timeout,  # type: ignore
+                model=model,
+                messages=messages,
+                model_response=model_response,
+                optional_params=optional_params,
+                logging_obj=logging,
+                stream=stream,
+                acompletion=acompletion,
+            )
+
+            ## RESPONSE OBJECT
+            response = model_response
+            return response
+
         elif custom_llm_provider == "cloudflare":
             api_key = (
                 api_key
@@ -760,6 +760,36 @@
         "litellm_provider": "azure_ai",
         "mode": "chat"
     },
+    "azure_ai/Meta-Llama-31-8B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.00000061,
+        "litellm_provider": "azure_ai",
+        "mode": "chat",
+        "source": "https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
+    },
+    "azure_ai/Meta-Llama-31-70B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.00000268,
+        "output_cost_per_token": 0.00000354,
+        "litellm_provider": "azure_ai",
+        "mode": "chat",
+        "source": "https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
+    },
+    "azure_ai/Meta-Llama-31-405B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.00000533,
+        "output_cost_per_token": 0.000016,
+        "litellm_provider": "azure_ai",
+        "mode": "chat",
+        "source": "https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
+    },
     "babbage-002": {
         "max_tokens": 16384,
         "max_input_tokens": 16384,
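As a quick sanity check of these rates: 1,000 prompt tokens plus 500 completion tokens on the 70B entry cost 1000 × 0.00000268 + 500 × 0.00000354 ≈ $0.00445. A sketch using litellm's cost helper (treat the exact signature as an assumption; it may vary between versions):

```python
import litellm

prompt_cost, completion_cost = litellm.cost_per_token(
    model="azure_ai/Meta-Llama-31-70B-Instruct",
    prompt_tokens=1000,
    completion_tokens=500,
)
print(prompt_cost + completion_cost)  # expected ~0.00445
```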
@@ -1948,6 +1978,16 @@
         "supports_function_calling": true,
         "supports_vision": true
     },
+    "vertex_ai/meta/llama3-405b-instruct-maas": {
+        "max_tokens": 32000,
+        "max_input_tokens": 32000,
+        "max_output_tokens": 32000,
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "vertex_ai-llama_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
+    },
     "vertex_ai/imagegeneration@006": {
         "cost_per_image": 0.020,
         "litellm_provider": "vertex_ai-image-models",
@@ -3633,6 +3673,24 @@
         "litellm_provider": "bedrock",
         "mode": "chat"
     },
+    "meta.llama3-1-8b-instruct-v1:0": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 2048,
+        "input_cost_per_token": 0.0000004,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "meta.llama3-1-70b-instruct-v1:0": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 2048,
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
         "max_tokens": 77,
         "max_input_tokens": 77,
@@ -1,5 +1,8 @@
 model_list:
-  - model_name: groq-llama3
+  - model_name: "*" # all requests where model not in your config go to this deployment
     litellm_params:
-      model: groq/llama3-groq-70b-8192-tool-use-preview
-      api_key: os.environ/GROQ_API_KEY
+      model: "openai/*" # passes our validation check that a real provider is given
+      api_key: ""
+
+general_settings:
+  completion_model: "gpt-3.5-turbo"
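With the wildcard entry above, any model name not listed in the config is forwarded to the `openai/*` deployment. A hedged sketch against a locally running proxy; the port and master key are assumptions taken from the default docker setup:

```python
import openai

client = openai.OpenAI(base_url="http://localhost:4000", api_key="sk-1234")
response = client.chat.completions.create(
    model="gpt-4o-mini",  # not in the config; matched by the "*" deployment
    messages=[{"role": "user", "content": "hi"}],
)
```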
@@ -228,6 +228,10 @@ class LiteLLMRoutes(enum.Enum):
         "/utils/token_counter",
     ]

+    anthropic_routes: List = [
+        "/v1/messages",
+    ]
+
     info_routes: List = [
         "/key/info",
         "/team/info",
@@ -880,6 +884,26 @@ class BlockTeamRequest(LiteLLMBase):
     team_id: str  # required


+class AddTeamCallback(LiteLLMBase):
+    callback_name: str
+    callback_type: Literal["success", "failure", "success_and_failure"]
+    # for now - only supported for langfuse
+    callback_vars: Dict[
+        Literal["langfuse_public_key", "langfuse_secret_key", "langfuse_host"], str
+    ]
+
+
+class TeamCallbackMetadata(LiteLLMBase):
+    success_callback: Optional[List[str]] = []
+    failure_callback: Optional[List[str]] = []
+    # for now - only supported for langfuse
+    callback_vars: Optional[
+        Dict[
+            Literal["langfuse_public_key", "langfuse_secret_key", "langfuse_host"], str
+        ]
+    ] = {}
+
+
 class LiteLLM_TeamTable(TeamBase):
     spend: Optional[float] = None
     max_parallel_requests: Optional[int] = None
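Roughly, the request body for the new `/team/{team_id}/callback` endpoint (defined later in this commit) validates against `AddTeamCallback`. A sketch with placeholder keys:

```python
from litellm.proxy._types import AddTeamCallback

payload = AddTeamCallback(
    callback_name="langfuse",
    callback_type="success",
    callback_vars={
        "langfuse_public_key": "pk-lf-...",  # placeholder
        "langfuse_secret_key": "sk-lf-...",  # placeholder
        "langfuse_host": "https://cloud.langfuse.com",
    },
)
```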
@@ -1232,6 +1256,7 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
     soft_budget: Optional[float] = None
     team_model_aliases: Optional[Dict] = None
     team_member_spend: Optional[float] = None
+    team_metadata: Optional[Dict] = None

     # End User Params
     end_user_id: Optional[str] = None
@@ -1677,3 +1702,5 @@ class ProxyErrorTypes(str, enum.Enum):
     budget_exceeded = "budget_exceeded"
     expired_key = "expired_key"
     auth_error = "auth_error"
+    internal_server_error = "internal_server_error"
+    bad_request_error = "bad_request_error"
@@ -24,7 +24,7 @@ from litellm.proxy._types import (
     LitellmUserRoles,
     UserAPIKeyAuth,
 )
-from litellm.proxy.auth.auth_utils import is_openai_route
+from litellm.proxy.auth.auth_utils import is_llm_api_route
 from litellm.proxy.utils import PrismaClient, ProxyLogging, log_to_opentelemetry
 from litellm.types.services import ServiceLoggerPayload, ServiceTypes
@@ -57,6 +57,7 @@ def common_checks(
     4. If end_user (either via JWT or 'user' passed to /chat/completions, /embeddings endpoint) is in budget
     5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
     6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
+    7. [OPTIONAL] If guardrails modified - is request allowed to change this
     """
     _model = request_body.get("model", None)
     if team_object is not None and team_object.blocked is True:
@@ -106,7 +107,7 @@ def common_checks(
         general_settings.get("enforce_user_param", None) is not None
         and general_settings["enforce_user_param"] == True
     ):
-        if is_openai_route(route=route) and "user" not in request_body:
+        if is_llm_api_route(route=route) and "user" not in request_body:
             raise Exception(
                 f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
             )

@@ -122,7 +123,7 @@ def common_checks(
                 + CommonProxyErrors.not_premium_user.value
             )

-        if is_openai_route(route=route):
+        if is_llm_api_route(route=route):
             # loop through each enforced param
             # example enforced_params ['user', 'metadata', 'metadata.generation_name']
             for enforced_param in general_settings["enforced_params"]:

@@ -150,7 +151,7 @@ def common_checks(
         and global_proxy_spend is not None
         # only run global budget checks for OpenAI routes
         # Reason - the Admin UI should continue working if the proxy crosses its global budget
-        and is_openai_route(route=route)
+        and is_llm_api_route(route=route)
         and route != "/v1/models"
         and route != "/models"
     ):
@@ -158,6 +159,22 @@ def common_checks(
         raise litellm.BudgetExceededError(
             current_cost=global_proxy_spend, max_budget=litellm.max_budget
         )
+
+    _request_metadata: dict = request_body.get("metadata", {}) or {}
+    if _request_metadata.get("guardrails"):
+        # check if team allowed to modify guardrails
+        from litellm.proxy.guardrails.guardrail_helpers import can_modify_guardrails
+
+        can_modify: bool = can_modify_guardrails(team_object)
+        if can_modify is False:
+            from fastapi import HTTPException
+
+            raise HTTPException(
+                status_code=403,
+                detail={
+                    "error": "Your team does not have permission to modify guardrails."
+                },
+            )
     return True
@@ -46,7 +46,7 @@ def route_in_additonal_public_routes(current_route: str):
     return False


-def is_openai_route(route: str) -> bool:
+def is_llm_api_route(route: str) -> bool:
     """
     Helper to check if provided route is an OpenAI route

@@ -59,6 +59,9 @@ def is_openai_route(route: str) -> bool:
     if route in LiteLLMRoutes.openai_routes.value:
         return True

+    if route in LiteLLMRoutes.anthropic_routes.value:
+        return True
+
     # fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
     # Check for routes with placeholders
     for openai_route in LiteLLMRoutes.openai_routes.value:
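A small sketch of the behavioral change, assuming `/chat/completions` is already listed in `openai_routes`:

```python
from litellm.proxy.auth.auth_utils import is_llm_api_route

assert is_llm_api_route(route="/chat/completions") is True  # unchanged
assert is_llm_api_route(route="/v1/messages") is True       # newly covered via anthropic_routes
```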
@@ -57,7 +57,7 @@ from litellm.proxy.auth.auth_checks import (
     log_to_opentelemetry,
 )
 from litellm.proxy.auth.auth_utils import (
-    is_openai_route,
+    is_llm_api_route,
     route_in_additonal_public_routes,
 )
 from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
@@ -924,6 +924,7 @@ async def user_api_key_auth(
                 rpm_limit=valid_token.team_rpm_limit,
                 blocked=valid_token.team_blocked,
                 models=valid_token.team_models,
+                metadata=valid_token.team_metadata,
             )

             user_api_key_cache.set_cache(

@@ -994,9 +995,9 @@ async def user_api_key_auth(
             _user_role = _get_user_role(user_id_information=user_id_information)

             if not _is_user_proxy_admin(user_id_information):  # if non-admin
-                if is_openai_route(route=route):
+                if is_llm_api_route(route=route):
                     pass
-                elif is_openai_route(route=request["route"].name):
+                elif is_llm_api_route(route=request["route"].name):
                     pass
                 elif (
                     route in LiteLLMRoutes.info_routes.value

@@ -1049,7 +1050,7 @@ async def user_api_key_auth(

                     pass
                 elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
-                    if is_openai_route(route=route):
+                    if is_llm_api_route(route=route):
                         raise HTTPException(
                             status_code=status.HTTP_403_FORBIDDEN,
                             detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",
@@ -23,11 +23,11 @@ def initialize_callbacks_on_proxy(
     )
     if isinstance(value, list):
         imported_list: List[Any] = []
-        known_compatible_callbacks = list(
-            get_args(litellm._custom_logger_compatible_callbacks_literal)
-        )
         for callback in value:  # ["presidio", <my-custom-callback>]
-            if isinstance(callback, str) and callback in known_compatible_callbacks:
+            if (
+                isinstance(callback, str)
+                and callback in litellm._known_custom_logger_compatible_callbacks
+            ):
                 imported_list.append(callback)
             elif isinstance(callback, str) and callback == "otel":
                 from litellm.integrations.opentelemetry import OpenTelemetry
@@ -1,9 +1,26 @@
+from typing import Dict
+
 import litellm
 from litellm._logging import verbose_proxy_logger
-from litellm.proxy.proxy_server import UserAPIKeyAuth
+from litellm.proxy.proxy_server import LiteLLM_TeamTable, UserAPIKeyAuth
 from litellm.types.guardrails import *


+def can_modify_guardrails(team_obj: Optional[LiteLLM_TeamTable]) -> bool:
+    if team_obj is None:
+        return True
+
+    team_metadata = team_obj.metadata or {}
+
+    if team_metadata.get("guardrails", None) is not None and isinstance(
+        team_metadata.get("guardrails"), Dict
+    ):
+        if team_metadata.get("guardrails", {}).get("modify_guardrails", None) is False:
+            return False
+
+    return True
+
+
 async def should_proceed_based_on_metadata(data: dict, guardrail_name: str) -> bool:
     """
     checks if this guardrail should be applied to this call
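The team-level switch this helper reads lives under `metadata["guardrails"]`. A rough illustration; the pydantic model may require additional fields in practice, so treat the constructor call as a sketch:

```python
from litellm.proxy._types import LiteLLM_TeamTable
from litellm.proxy.guardrails.guardrail_helpers import can_modify_guardrails

team = LiteLLM_TeamTable(
    team_id="team-1",  # placeholder
    metadata={"guardrails": {"modify_guardrails": False}},
)
assert can_modify_guardrails(team) is False
assert can_modify_guardrails(None) is True  # no team context -> allowed
```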
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional
 from fastapi import Request

 from litellm._logging import verbose_logger, verbose_proxy_logger
-from litellm.proxy._types import CommonProxyErrors, UserAPIKeyAuth
+from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth
 from litellm.types.utils import SupportedCacheControls

 if TYPE_CHECKING:
@@ -39,6 +39,9 @@ def _get_metadata_variable_name(request: Request) -> str:
     """
     if "thread" in request.url.path or "assistant" in request.url.path:
         return "litellm_metadata"
+    if "/v1/messages" in request.url.path:
+        # anthropic API has a field called metadata
+        return "litellm_metadata"
     else:
         return "metadata"
@@ -207,6 +210,32 @@ async def add_litellm_data_to_request(
         **data,
     }  # add the team-specific configs to the completion call

+    # Team Callbacks controls
+    if user_api_key_dict.team_metadata is not None:
+        team_metadata = user_api_key_dict.team_metadata
+        if "callback_settings" in team_metadata:
+            callback_settings = team_metadata.get("callback_settings", None) or {}
+            callback_settings_obj = TeamCallbackMetadata(**callback_settings)
+            verbose_proxy_logger.debug(
+                "Team callback settings activated: %s", callback_settings_obj
+            )
+            """
+            callback_settings = {
+              {
+                'callback_vars': {'langfuse_public_key': 'pk', 'langfuse_secret_key': 'sk_'},
+                'failure_callback': [],
+                'success_callback': ['langfuse', 'langfuse']
+              }
+            }
+            """
+            data["success_callback"] = callback_settings_obj.success_callback
+            data["failure_callback"] = callback_settings_obj.failure_callback
+
+            if callback_settings_obj.callback_vars is not None:
+                # unpack callback_vars in data
+                for k, v in callback_settings_obj.callback_vars.items():
+                    data[k] = v
+
     return data
@@ -333,6 +333,13 @@ async def update_key_fn(
         expires = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
         non_default_values["expires"] = expires

+    if "budget_duration" in non_default_values:
+        duration_s = _duration_in_seconds(
+            duration=non_default_values["budget_duration"]
+        )
+        key_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
+        non_default_values["budget_reset_at"] = key_reset_at
+
     response = await prisma_client.update_data(
         token=key, data={**non_default_values, "token": key}
     )
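Conceptually the new branch just computes `budget_reset_at = now + budget_duration`. A standalone sketch that does not use the proxy's internal `_duration_in_seconds` helper; the toy parser is an assumption about the accepted duration format:

```python
from datetime import datetime, timedelta, timezone

def duration_to_seconds(duration: str) -> int:
    # toy parser for "30s" / "30m" / "30h" / "30d" style strings
    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    return int(duration[:-1]) * units[duration[-1]]

budget_reset_at = datetime.now(timezone.utc) + timedelta(
    seconds=duration_to_seconds("30d")
)
```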
364
litellm/proxy/management_endpoints/team_callback_endpoints.py
Normal file
364
litellm/proxy/management_endpoints/team_callback_endpoints.py
Normal file
|
@ -0,0 +1,364 @@
|
||||||
|
"""
|
||||||
|
Endpoints to control callbacks per team
|
||||||
|
|
||||||
|
Use this when each team should control its own callbacks
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import traceback
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import fastapi
|
||||||
|
from fastapi import APIRouter, Depends, Header, HTTPException, Request, status
|
||||||
|
|
||||||
|
import litellm
|
||||||
|
from litellm._logging import verbose_proxy_logger
|
||||||
|
from litellm.proxy._types import (
|
||||||
|
AddTeamCallback,
|
||||||
|
LiteLLM_TeamTable,
|
||||||
|
ProxyErrorTypes,
|
||||||
|
ProxyException,
|
||||||
|
TeamCallbackMetadata,
|
||||||
|
UserAPIKeyAuth,
|
||||||
|
)
|
||||||
|
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
||||||
|
from litellm.proxy.management_helpers.utils import (
|
||||||
|
add_new_member,
|
||||||
|
management_endpoint_wrapper,
|
||||||
|
)
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/team/{team_id:path}/callback",
|
||||||
|
tags=["team management"],
|
||||||
|
dependencies=[Depends(user_api_key_auth)],
|
||||||
|
)
|
||||||
|
@management_endpoint_wrapper
|
||||||
|
async def add_team_callbacks(
|
||||||
|
data: AddTeamCallback,
|
||||||
|
http_request: Request,
|
||||||
|
team_id: str,
|
||||||
|
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
|
||||||
|
litellm_changed_by: Optional[str] = Header(
|
||||||
|
None,
|
||||||
|
description="The litellm-changed-by header enables tracking of actions performed by authorized users on behalf of other users, providing an audit trail for accountability",
|
||||||
|
),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Add a success/failure callback to a team
|
||||||
|
|
||||||
|
Use this if if you want different teams to have different success/failure callbacks
|
||||||
|
|
||||||
|
Example curl:
|
||||||
|
```
|
||||||
|
curl -X POST 'http:/localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-H 'Authorization: Bearer sk-1234' \
|
||||||
|
-d '{
|
||||||
|
"callback_name": "langfuse",
|
||||||
|
"callback_type": "success",
|
||||||
|
"callback_vars": {"langfuse_public_key": "pk-lf-xxxx1", "langfuse_secret_key": "sk-xxxxx"}
|
||||||
|
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
This means for the team where team_id = dbe2f686-a686-4896-864a-4c3924458709, all LLM calls will be logged to langfuse using the public key pk-lf-xxxx1 and the secret key sk-xxxxx
|
||||||
|
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from litellm.proxy.proxy_server import (
|
||||||
|
_duration_in_seconds,
|
||||||
|
create_audit_log_for_update,
|
||||||
|
litellm_proxy_admin_name,
|
||||||
|
prisma_client,
|
||||||
|
)
|
||||||
|
|
||||||
|
if prisma_client is None:
|
||||||
|
raise HTTPException(status_code=500, detail={"error": "No db connected"})
|
||||||
|
|
||||||
|
# Check if team_id exists already
|
||||||
|
_existing_team = await prisma_client.get_data(
|
||||||
|
team_id=team_id, table_name="team", query_type="find_unique"
|
||||||
|
)
|
||||||
|
if _existing_team is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail={
|
||||||
|
"error": f"Team id = {team_id} does not exist. Please use a different team id."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# store team callback settings in metadata
|
||||||
|
team_metadata = _existing_team.metadata
|
||||||
|
team_callback_settings = team_metadata.get("callback_settings", {})
|
||||||
|
# expect callback settings to be
|
||||||
|
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
|
||||||
|
if data.callback_type == "success":
|
||||||
|
if team_callback_settings_obj.success_callback is None:
|
||||||
|
team_callback_settings_obj.success_callback = []
|
||||||
|
|
||||||
|
if data.callback_name in team_callback_settings_obj.success_callback:
|
||||||
|
raise ProxyException(
|
||||||
|
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.success_callback}",
|
||||||
|
code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
type=ProxyErrorTypes.bad_request_error,
|
||||||
|
param="callback_name",
|
||||||
|
)
|
||||||
|
|
||||||
|
team_callback_settings_obj.success_callback.append(data.callback_name)
|
||||||
|
elif data.callback_type == "failure":
|
||||||
|
if team_callback_settings_obj.failure_callback is None:
|
||||||
|
team_callback_settings_obj.failure_callback = []
|
||||||
|
|
||||||
|
if data.callback_name in team_callback_settings_obj.failure_callback:
|
||||||
|
raise ProxyException(
|
||||||
|
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.failure_callback}",
|
||||||
|
code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
type=ProxyErrorTypes.bad_request_error,
|
||||||
|
param="callback_name",
|
||||||
|
)
|
||||||
|
team_callback_settings_obj.failure_callback.append(data.callback_name)
|
||||||
|
elif data.callback_type == "success_and_failure":
|
||||||
|
if team_callback_settings_obj.success_callback is None:
|
||||||
|
team_callback_settings_obj.success_callback = []
|
||||||
|
if team_callback_settings_obj.failure_callback is None:
|
||||||
|
team_callback_settings_obj.failure_callback = []
|
||||||
|
if data.callback_name in team_callback_settings_obj.success_callback:
|
||||||
|
raise ProxyException(
|
||||||
|
message=f"callback_name = {data.callback_name} already exists in success_callback, for team_id = {team_id}. \n Existing success_callback = {team_callback_settings_obj.success_callback}",
|
||||||
|
code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
type=ProxyErrorTypes.bad_request_error,
|
||||||
|
param="callback_name",
|
||||||
|
)
|
||||||
|
|
||||||
|
if data.callback_name in team_callback_settings_obj.failure_callback:
|
||||||
|
raise ProxyException(
|
||||||
|
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.failure_callback}",
|
||||||
|
code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
type=ProxyErrorTypes.bad_request_error,
|
||||||
|
param="callback_name",
|
||||||
|
)
|
||||||
|
|
||||||
|
team_callback_settings_obj.success_callback.append(data.callback_name)
|
||||||
|
team_callback_settings_obj.failure_callback.append(data.callback_name)
|
||||||
|
for var, value in data.callback_vars.items():
|
||||||
|
if team_callback_settings_obj.callback_vars is None:
|
||||||
|
team_callback_settings_obj.callback_vars = {}
|
||||||
|
team_callback_settings_obj.callback_vars[var] = value
|
||||||
|
|
||||||
|
team_callback_settings_obj_dict = team_callback_settings_obj.model_dump()
|
||||||
|
|
||||||
|
team_metadata["callback_settings"] = team_callback_settings_obj_dict
|
||||||
|
team_metadata_json = json.dumps(team_metadata) # update team_metadata
|
||||||
|
|
||||||
|
new_team_row = await prisma_client.db.litellm_teamtable.update(
|
||||||
|
where={"team_id": team_id}, data={"metadata": team_metadata_json} # type: ignore
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"data": new_team_row,
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
verbose_proxy_logger.error(
|
||||||
|
"litellm.proxy.proxy_server.add_team_callbacks(): Exception occured - {}".format(
|
||||||
|
str(e)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
verbose_proxy_logger.debug(traceback.format_exc())
|
||||||
|
if isinstance(e, HTTPException):
|
||||||
|
raise ProxyException(
|
||||||
|
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
|
||||||
|
type=ProxyErrorTypes.internal_server_error.value,
|
||||||
|
param=getattr(e, "param", "None"),
|
||||||
|
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
|
||||||
|
)
|
||||||
|
elif isinstance(e, ProxyException):
|
||||||
|
raise e
|
||||||
|
raise ProxyException(
|
||||||
|
message="Internal Server Error, " + str(e),
|
||||||
|
type=ProxyErrorTypes.internal_server_error.value,
|
||||||
|
param=getattr(e, "param", "None"),
|
||||||
|
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/team/{team_id}/disable_logging",
    tags=["team management"],
    dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def disable_team_logging(
    http_request: Request,
    team_id: str,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    try:
        from litellm.proxy.proxy_server import prisma_client

        if prisma_client is None:
            raise HTTPException(status_code=500, detail={"error": "No db connected"})

        # Check if team exists
        _existing_team = await prisma_client.get_data(
            team_id=team_id, table_name="team", query_type="find_unique"
        )
        if _existing_team is None:
            raise HTTPException(
                status_code=404,
                detail={"error": f"Team id = {team_id} does not exist."},
            )

        # Update team metadata to disable logging
        team_metadata = _existing_team.metadata
        team_callback_settings = team_metadata.get("callback_settings", {})
        team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)

        # Reset callbacks
        team_callback_settings_obj.success_callback = []
        team_callback_settings_obj.failure_callback = []

        # Update metadata
        team_metadata["callback_settings"] = team_callback_settings_obj.model_dump()
        team_metadata_json = json.dumps(team_metadata)

        # Update team in database
        updated_team = await prisma_client.db.litellm_teamtable.update(
            where={"team_id": team_id}, data={"metadata": team_metadata_json}  # type: ignore
        )

        if updated_team is None:
            raise HTTPException(
                status_code=404,
                detail={
                    "error": f"Team id = {team_id} does not exist. Error updating team logging"
                },
            )

        return {
            "status": "success",
            "message": f"Logging disabled for team {team_id}",
            "data": {
                "team_id": updated_team.team_id,
                "success_callbacks": [],
                "failure_callbacks": [],
            },
        }

    except Exception as e:
        verbose_proxy_logger.error(
            f"litellm.proxy.proxy_server.disable_team_logging(): Exception occurred - {str(e)}"
        )
        verbose_proxy_logger.debug(traceback.format_exc())
        if isinstance(e, HTTPException):
            raise ProxyException(
                message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
                type=ProxyErrorTypes.internal_server_error.value,
                param=getattr(e, "param", "None"),
                code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
            )
        elif isinstance(e, ProxyException):
            raise e
        raise ProxyException(
            message="Internal Server Error, " + str(e),
            type=ProxyErrorTypes.internal_server_error.value,
            param=getattr(e, "param", "None"),
            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
@router.get(
    "/team/{team_id:path}/callback",
    tags=["team management"],
    dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def get_team_callbacks(
    http_request: Request,
    team_id: str,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Get the success/failure callbacks and variables for a team

    Example curl:
    ```
    curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
        -H 'Authorization: Bearer sk-1234'
    ```

    This will return the callback settings for the team with id dbe2f686-a686-4896-864a-4c3924458709

    Returns {
        "status": "success",
        "data": {
            "team_id": team_id,
            "success_callbacks": team_callback_settings_obj.success_callback,
            "failure_callbacks": team_callback_settings_obj.failure_callback,
            "callback_vars": team_callback_settings_obj.callback_vars,
        },
    }
    """
    try:
        from litellm.proxy.proxy_server import prisma_client

        if prisma_client is None:
            raise HTTPException(status_code=500, detail={"error": "No db connected"})

        # Check if team_id exists
        _existing_team = await prisma_client.get_data(
            team_id=team_id, table_name="team", query_type="find_unique"
        )
        if _existing_team is None:
            raise HTTPException(
                status_code=404,
                detail={"error": f"Team id = {team_id} does not exist."},
            )

        # Retrieve team callback settings from metadata
        team_metadata = _existing_team.metadata
        team_callback_settings = team_metadata.get("callback_settings", {})

        # Convert to TeamCallbackMetadata object for consistent structure
        team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)

        return {
            "status": "success",
            "data": {
                "team_id": team_id,
                "success_callbacks": team_callback_settings_obj.success_callback,
                "failure_callbacks": team_callback_settings_obj.failure_callback,
                "callback_vars": team_callback_settings_obj.callback_vars,
            },
        }

    except Exception as e:
        verbose_proxy_logger.error(
            "litellm.proxy.proxy_server.get_team_callbacks(): Exception occurred - {}".format(
                str(e)
            )
        )
        verbose_proxy_logger.debug(traceback.format_exc())
        if isinstance(e, HTTPException):
            raise ProxyException(
                message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
                type=ProxyErrorTypes.internal_server_error.value,
                param=getattr(e, "param", "None"),
                code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
            )
        elif isinstance(e, ProxyException):
            raise e
        raise ProxyException(
            message="Internal Server Error, " + str(e),
            type=ProxyErrorTypes.internal_server_error.value,
            param=getattr(e, "param", "None"),
            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
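Taken together, these endpoints let a proxy admin attach, inspect, and disable team-scoped logging callbacks. The sketch below is a minimal client-side walkthrough, assuming the proxy is reachable on http://localhost:4000, that `sk-1234` is a valid admin key, and that the add-callback route is a POST on the same `/team/{team_id}/callback` path; the team id, callback name, and callback variables are placeholders.

```python
# Hypothetical client-side walkthrough of the team callback endpoints above.
# Base URL, API key, team id, and callback variables are placeholders.
import requests

BASE_URL = "http://localhost:4000"
HEADERS = {"Authorization": "Bearer sk-1234"}
TEAM_ID = "dbe2f686-a686-4896-864a-4c3924458709"

# 1. Attach a success/failure callback to the team (assumed POST route).
add_resp = requests.post(
    f"{BASE_URL}/team/{TEAM_ID}/callback",
    headers=HEADERS,
    json={
        "callback_name": "langfuse",
        "callback_type": "success_and_failure",
        "callback_vars": {
            "langfuse_public_key": "pk-placeholder",
            "langfuse_secret_key": "sk-placeholder",
        },
    },
)
print(add_resp.json())  # {"status": "success", "data": {...updated team row...}}

# 2. Read back the team's callback settings.
get_resp = requests.get(f"{BASE_URL}/team/{TEAM_ID}/callback", headers=HEADERS)
print(get_resp.json()["data"])  # success_callbacks / failure_callbacks / callback_vars

# 3. Disable logging for the team (resets both callback lists).
disable_resp = requests.post(
    f"{BASE_URL}/team/{TEAM_ID}/disable_logging", headers=HEADERS
)
print(disable_resp.json()["message"])  # "Logging disabled for team <team_id>"
```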
@@ -363,6 +363,7 @@ async def update_team(
         # set the budget_reset_at in DB
         updated_kv["budget_reset_at"] = reset_at

+    updated_kv = prisma_client.jsonify_object(data=updated_kv)
     team_row: Optional[
         LiteLLM_TeamTable
     ] = await prisma_client.db.litellm_teamtable.update(
@@ -1,10 +1,21 @@
 model_list:
+  - model_name: gpt-4
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
   - model_name: fireworks-llama-v3-70b-instruct
     litellm_params:
       model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
-      api_key: "os.environ/FIREWORKS_AI_API_KEY"
+      api_key: "os.environ/FIREWORKS"

-router_settings:
-  enable_tag_filtering: True # 👈 Key Change
 general_settings:
   master_key: sk-1234
+  alerting: ["slack"]
+  alerting_threshold: 0.0001
+  alert_to_webhook_url: {
+    "llm_too_slow": "https://hooks.slack.com/services/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",
+    "llm_requests_hanging": "https://hooks.slack.com/services/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",
+  }
+
+litellm_settings:
+  success_callback: ["langfuse"]
@@ -170,6 +170,9 @@ from litellm.proxy.management_endpoints.key_management_endpoints import (
 from litellm.proxy.management_endpoints.key_management_endpoints import (
     router as key_management_router,
 )
+from litellm.proxy.management_endpoints.team_callback_endpoints import (
+    router as team_callback_router,
+)
 from litellm.proxy.management_endpoints.team_endpoints import router as team_router
 from litellm.proxy.openai_files_endpoints.files_endpoints import (
     router as openai_files_router,
@@ -654,7 +657,11 @@ async def _PROXY_track_cost_callback(
     global prisma_client, custom_db_client
     try:
         # check if it has collected an entire stream response
-        verbose_proxy_logger.debug("Proxy: In track_cost_callback for: %s", kwargs)
+        verbose_proxy_logger.debug(
+            "Proxy: In track_cost_callback for: kwargs=%s and completion_response: %s",
+            kwargs,
+            completion_response,
+        )
         verbose_proxy_logger.debug(
             f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}"
         )
@@ -1620,6 +1627,7 @@ class ProxyConfig:
                 alerting=general_settings.get("alerting", None),
                 alerting_threshold=general_settings.get("alerting_threshold", 600),
                 alert_types=general_settings.get("alert_types", None),
+                alert_to_webhook_url=general_settings.get("alert_to_webhook_url", None),
                 alerting_args=general_settings.get("alerting_args", None),
                 redis_cache=redis_usage_cache,
             )
@@ -2905,6 +2913,7 @@ async def chat_completion(
             fastest_response_batch_completion = hidden_params.get(
                 "fastest_response_batch_completion", None
             )
+            additional_headers: dict = hidden_params.get("additional_headers", {}) or {}

             # Post Call Processing
             if llm_router is not None:
@@ -2927,6 +2936,7 @@ async def chat_completion(
                     response_cost=response_cost,
                     model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
                     fastest_response_batch_completion=fastest_response_batch_completion,
+                    **additional_headers,
                 )
                 selected_data_generator = select_data_generator(
                     response=response,
@@ -2944,8 +2954,10 @@ async def chat_completion(
             user_api_key_dict=user_api_key_dict, response=response
         )

-        hidden_params = getattr(response, "_hidden_params", {}) or {}
-        additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
+        hidden_params = (
+            getattr(response, "_hidden_params", {}) or {}
+        )  # get any updated response headers
+        additional_headers = hidden_params.get("additional_headers", {}) or {}

         fastapi_response.headers.update(
             get_custom_headers(
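These chat_completion changes forward any provider `additional_headers` (for example the upstream rate-limit counters) onto the proxy's own HTTP response. A rough way to observe that from a client, assuming a locally running proxy, a placeholder key and model name, and assuming the headers are surfaced with the `llm_provider-` prefix used in the tests further down in this diff:

```python
# Rough check that upstream rate-limit headers are echoed back by the proxy.
# URL, key, and model name are placeholders; the header prefix is an assumption
# based on the `llm_provider-x-ratelimit-*` names asserted in the tests below.
import requests

resp = requests.post(
    "http://localhost:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-4",
        "messages": [{"role": "user", "content": "ping"}],
    },
)
print(resp.status_code)
for name, value in resp.headers.items():
    if name.lower().startswith("llm_provider-"):
        print(name, "=", value)  # e.g. llm_provider-x-ratelimit-remaining-requests
```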
@@ -9457,3 +9469,4 @@ app.include_router(analytics_router)
 app.include_router(debugging_endpoints_router)
 app.include_router(ui_crud_endpoints_router)
 app.include_router(openai_files_router)
+app.include_router(team_callback_router)
@@ -183,12 +183,12 @@ model LiteLLM_SpendLogs {
   model                String   @default("")
   model_id             String?  @default("") // the model id stored in proxy model db
   model_group          String?  @default("") // public model_name / model_group
-  api_base             String   @default("")
-  user                 String   @default("")
-  metadata             Json     @default("{}")
-  cache_hit            String   @default("")
-  cache_key            String   @default("")
-  request_tags         Json     @default("[]")
+  api_base             String?  @default("")
+  user                 String?  @default("")
+  metadata             Json?    @default("{}")
+  cache_hit            String?  @default("")
+  cache_key            String?  @default("")
+  request_tags         Json?    @default("[]")
   team_id              String?
   end_user             String?
   requester_ip_address String?
@@ -257,4 +257,4 @@ model LiteLLM_AuditLog {
   object_id      String   // id of the object being audited. This can be the key id, team id, user id, model id
   before_value   Json?    // value of the row
   updated_values Json?    // value of the row after change
 }
litellm/proxy/tests/test_anthropic_sdk.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+import os
+
+from anthropic import Anthropic
+
+client = Anthropic(
+    # This is the default and can be omitted
+    base_url="http://localhost:4000",
+    # this is a litellm proxy key :) - not a real anthropic key
+    api_key="sk-s4xN1IiLTCytwtZFJaYQrA",
+)
+
+message = client.messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-opus-20240229",
+)
+print(message.content)
|
@ -25,7 +25,7 @@ from typing_extensions import overload
|
||||||
import litellm
|
import litellm
|
||||||
import litellm.litellm_core_utils
|
import litellm.litellm_core_utils
|
||||||
import litellm.litellm_core_utils.litellm_logging
|
import litellm.litellm_core_utils.litellm_logging
|
||||||
from litellm import EmbeddingResponse, ImageResponse, ModelResponse
|
from litellm import EmbeddingResponse, ImageResponse, ModelResponse, get_litellm_params
|
||||||
from litellm._logging import verbose_proxy_logger
|
from litellm._logging import verbose_proxy_logger
|
||||||
from litellm._service_logger import ServiceLogging, ServiceTypes
|
from litellm._service_logger import ServiceLogging, ServiceTypes
|
||||||
from litellm.caching import DualCache, RedisCache
|
from litellm.caching import DualCache, RedisCache
|
||||||
|
@ -50,7 +50,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter
|
||||||
from litellm.proxy.hooks.parallel_request_limiter import (
|
from litellm.proxy.hooks.parallel_request_limiter import (
|
||||||
_PROXY_MaxParallelRequestsHandler,
|
_PROXY_MaxParallelRequestsHandler,
|
||||||
)
|
)
|
||||||
from litellm.types.utils import CallTypes
|
from litellm.types.utils import CallTypes, LoggedLiteLLMParams
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from opentelemetry.trace import Span as _Span
|
from opentelemetry.trace import Span as _Span
|
||||||
|
@ -188,6 +188,7 @@ class ProxyLogging:
|
||||||
"new_model_added",
|
"new_model_added",
|
||||||
"outage_alerts",
|
"outage_alerts",
|
||||||
]
|
]
|
||||||
|
self.alert_to_webhook_url: Optional[dict] = None
|
||||||
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
|
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
|
||||||
alerting_threshold=self.alerting_threshold,
|
alerting_threshold=self.alerting_threshold,
|
||||||
alerting=self.alerting,
|
alerting=self.alerting,
|
||||||
|
@ -202,6 +203,7 @@ class ProxyLogging:
|
||||||
redis_cache: Optional[RedisCache] = None,
|
redis_cache: Optional[RedisCache] = None,
|
||||||
alert_types: Optional[List[AlertType]] = None,
|
alert_types: Optional[List[AlertType]] = None,
|
||||||
alerting_args: Optional[dict] = None,
|
alerting_args: Optional[dict] = None,
|
||||||
|
alert_to_webhook_url: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
updated_slack_alerting: bool = False
|
updated_slack_alerting: bool = False
|
||||||
if alerting is not None:
|
if alerting is not None:
|
||||||
|
@ -213,6 +215,9 @@ class ProxyLogging:
|
||||||
if alert_types is not None:
|
if alert_types is not None:
|
||||||
self.alert_types = alert_types
|
self.alert_types = alert_types
|
||||||
updated_slack_alerting = True
|
updated_slack_alerting = True
|
||||||
|
if alert_to_webhook_url is not None:
|
||||||
|
self.alert_to_webhook_url = alert_to_webhook_url
|
||||||
|
updated_slack_alerting = True
|
||||||
|
|
||||||
if updated_slack_alerting is True:
|
if updated_slack_alerting is True:
|
||||||
self.slack_alerting_instance.update_values(
|
self.slack_alerting_instance.update_values(
|
||||||
|
@ -220,6 +225,7 @@ class ProxyLogging:
|
||||||
alerting_threshold=self.alerting_threshold,
|
alerting_threshold=self.alerting_threshold,
|
||||||
alert_types=self.alert_types,
|
alert_types=self.alert_types,
|
||||||
alerting_args=alerting_args,
|
alerting_args=alerting_args,
|
||||||
|
alert_to_webhook_url=self.alert_to_webhook_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
@@ -602,14 +608,20 @@ class ProxyLogging:
         if litellm_logging_obj is not None:
             ## UPDATE LOGGING INPUT
             _optional_params = {}
+            _litellm_params = {}
+
+            litellm_param_keys = LoggedLiteLLMParams.__annotations__.keys()
             for k, v in request_data.items():
-                if k != "model" and k != "user" and k != "litellm_params":
+                if k in litellm_param_keys:
+                    _litellm_params[k] = v
+                elif k != "model" and k != "user":
                     _optional_params[k] = v

             litellm_logging_obj.update_environment_variables(
                 model=request_data.get("model", ""),
                 user=request_data.get("user", ""),
                 optional_params=_optional_params,
-                litellm_params=request_data.get("litellm_params", {}),
+                litellm_params=_litellm_params,
             )

             input: Union[list, str, dict] = ""
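The updated hook logic above splits the raw request payload into litellm-level params (any key that appears in `LoggedLiteLLMParams`, defined later in this diff) and provider-facing optional params, instead of lumping everything except `model`/`user` together. A standalone sketch of that filtering, with a made-up request dict purely for illustration:

```python
# Standalone sketch of the key-splitting done in ProxyLogging above.
# The request_data dict here is illustrative, not a real proxy payload,
# and this trimmed TypedDict only copies a few fields from the full definition.
from typing import Optional, TypedDict


class LoggedLiteLLMParams(TypedDict, total=False):
    api_base: Optional[str]
    metadata: Optional[dict]
    proxy_server_request: Optional[dict]
    preset_cache_key: Optional[str]


request_data = {
    "model": "gpt-3.5-turbo",
    "user": "user-123",
    "temperature": 0.2,             # provider-facing optional param
    "metadata": {"team_id": "t1"},  # litellm-level param
    "api_base": "https://example.invalid/v1",
}

litellm_param_keys = LoggedLiteLLMParams.__annotations__.keys()

_litellm_params = {}
_optional_params = {}
for k, v in request_data.items():
    if k in litellm_param_keys:
        _litellm_params[k] = v
    elif k != "model" and k != "user":
        _optional_params[k] = v

print(_litellm_params)   # {'metadata': {'team_id': 't1'}, 'api_base': '...'}
print(_optional_params)  # {'temperature': 0.2}
```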
|
@ -832,6 +844,30 @@ class PrismaClient:
|
||||||
|
|
||||||
If the view doesn't exist, one will be created.
|
If the view doesn't exist, one will be created.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Check to see if all of the necessary views exist and if they do, simply return
|
||||||
|
# This is more efficient because it lets us check for all views in one
|
||||||
|
# query instead of multiple queries.
|
||||||
|
try:
|
||||||
|
ret = await self.db.query_raw(
|
||||||
|
"""
|
||||||
|
SELECT SUM(1) FROM pg_views
|
||||||
|
WHERE schemaname = 'public' AND viewname IN (
|
||||||
|
'LiteLLM_VerificationTokenView',
|
||||||
|
'MonthlyGlobalSpend',
|
||||||
|
'Last30dKeysBySpend',
|
||||||
|
'Last30dModelsBySpend',
|
||||||
|
'MonthlyGlobalSpendPerKey',
|
||||||
|
'Last30dTopEndUsersSpend'
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
if ret[0]['sum'] == 6:
|
||||||
|
print("All necessary views exist!") # noqa
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try to select one row from the view
|
# Try to select one row from the view
|
||||||
await self.db.query_raw(
|
await self.db.query_raw(
|
||||||
|
@ -1313,8 +1349,10 @@ class PrismaClient:
|
||||||
t.tpm_limit AS team_tpm_limit,
|
t.tpm_limit AS team_tpm_limit,
|
||||||
t.rpm_limit AS team_rpm_limit,
|
t.rpm_limit AS team_rpm_limit,
|
||||||
t.models AS team_models,
|
t.models AS team_models,
|
||||||
|
t.metadata AS team_metadata,
|
||||||
t.blocked AS team_blocked,
|
t.blocked AS team_blocked,
|
||||||
t.team_alias AS team_alias,
|
t.team_alias AS team_alias,
|
||||||
|
t.metadata AS team_metadata,
|
||||||
tm.spend AS team_member_spend,
|
tm.spend AS team_member_spend,
|
||||||
m.aliases as team_model_aliases
|
m.aliases as team_model_aliases
|
||||||
FROM "LiteLLM_VerificationToken" AS v
|
FROM "LiteLLM_VerificationToken" AS v
|
||||||
|
|
|
@ -895,6 +895,52 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):
|
||||||
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
|
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
|
||||||
|
|
||||||
|
|
||||||
|
from litellm.tests.test_completion import response_format_tests
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"model", ["vertex_ai/meta/llama3-405b-instruct-maas"]
|
||||||
|
) # "vertex_ai",
|
||||||
|
@pytest.mark.parametrize("sync_mode", [True, False]) # "vertex_ai",
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_llama_3_httpx(model, sync_mode):
|
||||||
|
try:
|
||||||
|
load_vertex_ai_credentials()
|
||||||
|
litellm.set_verbose = True
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "Your name is Litellm Bot, you are a helpful assistant",
|
||||||
|
},
|
||||||
|
# User asks for their name and weather in San Francisco
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Hello, what is your name and can you tell me the weather?",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"model": model,
|
||||||
|
"messages": messages,
|
||||||
|
}
|
||||||
|
if sync_mode:
|
||||||
|
response = litellm.completion(**data)
|
||||||
|
else:
|
||||||
|
response = await litellm.acompletion(**data)
|
||||||
|
|
||||||
|
response_format_tests(response=response)
|
||||||
|
|
||||||
|
print(f"response: {response}")
|
||||||
|
except litellm.RateLimitError as e:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
if "429 Quota exceeded" in str(e):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
|
||||||
|
|
||||||
|
|
||||||
def vertex_httpx_mock_reject_prompt_post(*args, **kwargs):
|
def vertex_httpx_mock_reject_prompt_post(*args, **kwargs):
|
||||||
mock_response = MagicMock()
|
mock_response = MagicMock()
|
||||||
mock_response.status_code = 200
|
mock_response.status_code = 200
|
||||||
|
|
|
@ -48,6 +48,42 @@ def test_anthropic_completion_input_translation():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_anthropic_completion_input_translation_with_metadata():
|
||||||
|
"""
|
||||||
|
Tests that cost tracking works as expected with LiteLLM Proxy
|
||||||
|
|
||||||
|
LiteLLM Proxy will insert litellm_metadata for anthropic endpoints to track user_api_key and user_api_key_team_id
|
||||||
|
|
||||||
|
This test ensures that the `litellm_metadata` is not present in the translated input
|
||||||
|
It ensures that `litellm.acompletion()` will receieve metadata which is a litellm specific param
|
||||||
|
"""
|
||||||
|
data = {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
"litellm_metadata": {
|
||||||
|
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
|
||||||
|
"user_api_key_alias": None,
|
||||||
|
"user_api_end_user_max_budget": None,
|
||||||
|
"litellm_api_version": "1.40.19",
|
||||||
|
"global_max_parallel_requests": None,
|
||||||
|
"user_api_key_user_id": "default_user_id",
|
||||||
|
"user_api_key_org_id": None,
|
||||||
|
"user_api_key_team_id": None,
|
||||||
|
"user_api_key_team_alias": None,
|
||||||
|
"user_api_key_team_max_budget": None,
|
||||||
|
"user_api_key_team_spend": None,
|
||||||
|
"user_api_key_spend": 0.0,
|
||||||
|
"user_api_key_max_budget": None,
|
||||||
|
"user_api_key_metadata": {},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data)
|
||||||
|
|
||||||
|
assert "litellm_metadata" not in translated_input
|
||||||
|
assert "metadata" in translated_input
|
||||||
|
assert translated_input["metadata"] == data["litellm_metadata"]
|
||||||
|
|
||||||
|
|
||||||
def test_anthropic_completion_e2e():
|
def test_anthropic_completion_e2e():
|
||||||
litellm.set_verbose = True
|
litellm.set_verbose = True
|
||||||
|
|
||||||
|
|
29
litellm/tests/test_arize_ai.py
Normal file
29
litellm/tests/test_arize_ai.py
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
|
||||||
|
|
||||||
|
import litellm
|
||||||
|
from litellm._logging import verbose_logger
|
||||||
|
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio()
|
||||||
|
async def test_async_otel_callback():
|
||||||
|
litellm.set_verbose = True
|
||||||
|
litellm.success_callback = ["arize"]
|
||||||
|
|
||||||
|
await litellm.acompletion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "hi test from local arize"}],
|
||||||
|
mock_response="hello",
|
||||||
|
temperature=0.1,
|
||||||
|
user="OTEL_USER",
|
||||||
|
)
|
|
@ -2,18 +2,19 @@
|
||||||
# This tests chaos monkeys - if random parts of the system are broken / things aren't sent correctly - what happens.
|
# This tests chaos monkeys - if random parts of the system are broken / things aren't sent correctly - what happens.
|
||||||
# Expect to add more edge cases to this over time.
|
# Expect to add more edge cases to this over time.
|
||||||
|
|
||||||
import sys, os
|
import os
|
||||||
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
sys.path.insert(
|
sys.path.insert(
|
||||||
0, os.path.abspath("../..")
|
0, os.path.abspath("../..")
|
||||||
) # Adds the parent directory to the system path
|
) # Adds the parent directory to the system path
|
||||||
import litellm
|
import litellm
|
||||||
from litellm import embedding, completion
|
from litellm import completion, embedding
|
||||||
from litellm.utils import Message
|
from litellm.utils import Message
|
||||||
|
|
||||||
|
|
||||||
# litellm.set_verbose = True
|
# litellm.set_verbose = True
|
||||||
user_message = "Hello, how are you?"
|
user_message = "Hello, how are you?"
|
||||||
messages = [{"content": user_message, "role": "user"}]
|
messages = [{"content": user_message, "role": "user"}]
|
||||||
|
@ -74,6 +75,8 @@ def test_completion_invalid_param_cohere():
|
||||||
response = completion(model="command-nightly", messages=messages, seed=12)
|
response = completion(model="command-nightly", messages=messages, seed=12)
|
||||||
pytest.fail(f"This should have failed cohere does not support `seed` parameter")
|
pytest.fail(f"This should have failed cohere does not support `seed` parameter")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
assert isinstance(e, litellm.UnsupportedParamsError)
|
||||||
|
print("got an exception=", str(e))
|
||||||
if " cohere does not support parameters: {'seed': 12}" in str(e):
|
if " cohere does not support parameters: {'seed': 12}" in str(e):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
|
53
litellm/tests/test_braintrust.py
Normal file
53
litellm/tests/test_braintrust.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
# What is this?
|
||||||
|
## This tests the braintrust integration
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from fastapi import Request
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(
|
||||||
|
0, os.path.abspath("../..")
|
||||||
|
) # Adds the parent directory to the system path
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import litellm
|
||||||
|
from litellm.llms.custom_httpx.http_handler import HTTPHandler
|
||||||
|
|
||||||
|
|
||||||
|
def test_braintrust_logging():
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
http_client = HTTPHandler()
|
||||||
|
|
||||||
|
setattr(
|
||||||
|
litellm.integrations.braintrust_logging,
|
||||||
|
"global_braintrust_sync_http_handler",
|
||||||
|
http_client,
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch.object(http_client, "post", new=MagicMock()) as mock_client:
|
||||||
|
|
||||||
|
# set braintrust as a callback, litellm will send the data to braintrust
|
||||||
|
litellm.callbacks = ["braintrust"]
|
||||||
|
|
||||||
|
# openai call
|
||||||
|
response = litellm.completion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
|
||||||
|
)
|
||||||
|
|
||||||
|
mock_client.assert_called()
|
|
@ -346,7 +346,7 @@ def test_completion_claude_3_empty_response():
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
|
"content": [{"type": "text", "text": "You are 2twNLGfqk4GMOn3ffp4p."}],
|
||||||
},
|
},
|
||||||
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
|
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
|
||||||
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
||||||
|
@ -1364,6 +1364,12 @@ def test_completion_openai_response_headers():
|
||||||
print("response_headers=", response._response_headers)
|
print("response_headers=", response._response_headers)
|
||||||
assert response._response_headers is not None
|
assert response._response_headers is not None
|
||||||
assert "x-ratelimit-remaining-tokens" in response._response_headers
|
assert "x-ratelimit-remaining-tokens" in response._response_headers
|
||||||
|
assert isinstance(
|
||||||
|
response._hidden_params["additional_headers"][
|
||||||
|
"llm_provider-x-ratelimit-remaining-requests"
|
||||||
|
],
|
||||||
|
str,
|
||||||
|
)
|
||||||
|
|
||||||
# /chat/completion - with streaming
|
# /chat/completion - with streaming
|
||||||
|
|
||||||
|
@ -1376,6 +1382,12 @@ def test_completion_openai_response_headers():
|
||||||
print("streaming response_headers=", response_headers)
|
print("streaming response_headers=", response_headers)
|
||||||
assert response_headers is not None
|
assert response_headers is not None
|
||||||
assert "x-ratelimit-remaining-tokens" in response_headers
|
assert "x-ratelimit-remaining-tokens" in response_headers
|
||||||
|
assert isinstance(
|
||||||
|
response._hidden_params["additional_headers"][
|
||||||
|
"llm_provider-x-ratelimit-remaining-requests"
|
||||||
|
],
|
||||||
|
str,
|
||||||
|
)
|
||||||
|
|
||||||
for chunk in streaming_response:
|
for chunk in streaming_response:
|
||||||
print("chunk=", chunk)
|
print("chunk=", chunk)
|
||||||
|
@ -1390,6 +1402,12 @@ def test_completion_openai_response_headers():
|
||||||
print("embedding_response_headers=", embedding_response_headers)
|
print("embedding_response_headers=", embedding_response_headers)
|
||||||
assert embedding_response_headers is not None
|
assert embedding_response_headers is not None
|
||||||
assert "x-ratelimit-remaining-tokens" in embedding_response_headers
|
assert "x-ratelimit-remaining-tokens" in embedding_response_headers
|
||||||
|
assert isinstance(
|
||||||
|
response._hidden_params["additional_headers"][
|
||||||
|
"llm_provider-x-ratelimit-remaining-requests"
|
||||||
|
],
|
||||||
|
str,
|
||||||
|
)
|
||||||
|
|
||||||
litellm.return_response_headers = False
|
litellm.return_response_headers = False
|
||||||
|
|
||||||
|
@ -2542,6 +2560,71 @@ def test_completion_anyscale_with_functions():
|
||||||
# test_completion_anyscale_with_functions()
|
# test_completion_anyscale_with_functions()
|
||||||
|
|
||||||
|
|
||||||
|
def test_completion_azure_extra_headers():
|
||||||
|
# this tests if we can pass api_key to completion, when it's not in the env.
|
||||||
|
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
||||||
|
# If you want to remove it, speak to Ishaan!
|
||||||
|
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
|
||||||
|
from httpx import Client
|
||||||
|
from openai import AzureOpenAI
|
||||||
|
|
||||||
|
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||||
|
|
||||||
|
http_client = Client()
|
||||||
|
|
||||||
|
with patch.object(http_client, "send", new=MagicMock()) as mock_client:
|
||||||
|
litellm.client_session = http_client
|
||||||
|
try:
|
||||||
|
response = completion(
|
||||||
|
model="azure/chatgpt-v-2",
|
||||||
|
messages=messages,
|
||||||
|
api_base=os.getenv("AZURE_API_BASE"),
|
||||||
|
api_version="2023-07-01-preview",
|
||||||
|
api_key=os.getenv("AZURE_API_KEY"),
|
||||||
|
extra_headers={
|
||||||
|
"Authorization": "my-bad-key",
|
||||||
|
"Ocp-Apim-Subscription-Key": "hello-world-testing",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
pytest.fail("Expected this to fail")
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
mock_client.assert_called()
|
||||||
|
|
||||||
|
print(f"mock_client.call_args: {mock_client.call_args}")
|
||||||
|
request = mock_client.call_args[0][0]
|
||||||
|
print(request.method) # This will print 'POST'
|
||||||
|
print(request.url) # This will print the full URL
|
||||||
|
print(request.headers) # This will print the full URL
|
||||||
|
auth_header = request.headers.get("Authorization")
|
||||||
|
apim_key = request.headers.get("Ocp-Apim-Subscription-Key")
|
||||||
|
print(auth_header)
|
||||||
|
assert auth_header == "my-bad-key"
|
||||||
|
assert apim_key == "hello-world-testing"
|
||||||
|
|
||||||
|
|
||||||
|
def test_completion_azure_ad_token():
|
||||||
|
# this tests if we can pass api_key to completion, when it's not in the env.
|
||||||
|
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
||||||
|
# If you want to remove it, speak to Ishaan!
|
||||||
|
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
|
||||||
|
from httpx import Client
|
||||||
|
from openai import AzureOpenAI
|
||||||
|
|
||||||
|
from litellm import completion
|
||||||
|
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||||
|
|
||||||
|
response = completion(
|
||||||
|
model="azure/chatgpt-v-2",
|
||||||
|
messages=messages,
|
||||||
|
# api_key="my-fake-ad-token",
|
||||||
|
azure_ad_token=os.getenv("AZURE_API_KEY"),
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
|
||||||
|
|
||||||
def test_completion_azure_key_completion_arg():
|
def test_completion_azure_key_completion_arg():
|
||||||
# this tests if we can pass api_key to completion, when it's not in the env.
|
# this tests if we can pass api_key to completion, when it's not in the env.
|
||||||
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
||||||
|
|
|
@ -881,6 +881,7 @@ def test_completion_azure_ai():
|
||||||
@pytest.mark.parametrize("sync_mode", [True, False])
|
@pytest.mark.parametrize("sync_mode", [True, False])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_completion_cost_hidden_params(sync_mode):
|
async def test_completion_cost_hidden_params(sync_mode):
|
||||||
|
litellm.return_response_headers = True
|
||||||
if sync_mode:
|
if sync_mode:
|
||||||
response = litellm.completion(
|
response = litellm.completion(
|
||||||
model="gpt-3.5-turbo",
|
model="gpt-3.5-turbo",
|
||||||
|
|
|
@ -235,6 +235,7 @@ class CompletionCustomHandler(
|
||||||
|
|
||||||
assert isinstance(kwargs["optional_params"], dict)
|
assert isinstance(kwargs["optional_params"], dict)
|
||||||
assert isinstance(kwargs["litellm_params"], dict)
|
assert isinstance(kwargs["litellm_params"], dict)
|
||||||
|
assert isinstance(kwargs["litellm_params"]["metadata"], Optional[dict])
|
||||||
assert isinstance(kwargs["start_time"], (datetime, type(None)))
|
assert isinstance(kwargs["start_time"], (datetime, type(None)))
|
||||||
assert isinstance(kwargs["stream"], bool)
|
assert isinstance(kwargs["stream"], bool)
|
||||||
assert isinstance(kwargs["user"], (str, type(None)))
|
assert isinstance(kwargs["user"], (str, type(None)))
|
||||||
|
|
|
@ -197,6 +197,29 @@ def test_openai_azure_embedding():
|
||||||
pytest.fail(f"Error occurred: {e}")
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
|
||||||
|
reason="Cannot run without being in CircleCI Runner",
|
||||||
|
)
|
||||||
|
def test_openai_azure_embedding_with_oidc_and_cf():
|
||||||
|
# TODO: Switch to our own Azure account, currently using ai.moda's account
|
||||||
|
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
|
||||||
|
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = embedding(
|
||||||
|
model="azure/text-embedding-ada-002",
|
||||||
|
input=["Hello"],
|
||||||
|
azure_ad_token="oidc/circleci/",
|
||||||
|
api_base="https://eastus2-litellm.openai.azure.com/",
|
||||||
|
api_version="2024-06-01",
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
pytest.fail(f"Error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
def test_openai_azure_embedding_optional_arg(mocker):
|
def test_openai_azure_embedding_optional_arg(mocker):
|
||||||
mocked_create_embeddings = mocker.patch.object(
|
mocked_create_embeddings = mocker.patch.object(
|
||||||
openai.resources.embeddings.Embeddings,
|
openai.resources.embeddings.Embeddings,
|
||||||
|
@ -650,3 +673,17 @@ async def test_databricks_embeddings(sync_mode):
|
||||||
# print(response)
|
# print(response)
|
||||||
|
|
||||||
# local_proxy_embeddings()
|
# local_proxy_embeddings()
|
||||||
|
|
||||||
|
|
||||||
|
def test_embedding_azure_ad_token():
|
||||||
|
# this tests if we can pass api_key to completion, when it's not in the env.
|
||||||
|
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
||||||
|
# If you want to remove it, speak to Ishaan!
|
||||||
|
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
|
||||||
|
|
||||||
|
response = embedding(
|
||||||
|
model="azure/azure-embedding-model",
|
||||||
|
input=["good morning from litellm"],
|
||||||
|
azure_ad_token=os.getenv("AZURE_API_KEY"),
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
|
|
@ -64,6 +64,30 @@ async def test_content_policy_exception_azure():
|
||||||
pytest.fail(f"An exception occurred - {str(e)}")
|
pytest.fail(f"An exception occurred - {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_content_policy_exception_openai():
|
||||||
|
try:
|
||||||
|
# this is ony a test - we needed some way to invoke the exception :(
|
||||||
|
litellm.set_verbose = True
|
||||||
|
response = await litellm.acompletion(
|
||||||
|
model="gpt-3.5-turbo-0613",
|
||||||
|
stream=True,
|
||||||
|
messages=[
|
||||||
|
{"role": "user", "content": "Gimme the lyrics to Don't Stop Me Now"}
|
||||||
|
],
|
||||||
|
)
|
||||||
|
async for chunk in response:
|
||||||
|
print(chunk)
|
||||||
|
except litellm.ContentPolicyViolationError as e:
|
||||||
|
print("caught a content policy violation error! Passed")
|
||||||
|
print("exception", e)
|
||||||
|
assert e.llm_provider == "openai"
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print()
|
||||||
|
pytest.fail(f"An exception occurred - {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
# Test 1: Context Window Errors
|
# Test 1: Context Window Errors
|
||||||
@pytest.mark.skip(reason="AWS Suspended Account")
|
@pytest.mark.skip(reason="AWS Suspended Account")
|
||||||
@pytest.mark.parametrize("model", exception_models)
|
@pytest.mark.parametrize("model", exception_models)
|
||||||
|
|
|
@ -36,6 +36,7 @@ async def test_async_langsmith_logging():
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
metadata={
|
metadata={
|
||||||
"id": run_id,
|
"id": run_id,
|
||||||
|
"tags": ["tag1", "tag2"],
|
||||||
"user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
|
"user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
|
||||||
"user_api_key_alias": "ishaans-langmsith-key",
|
"user_api_key_alias": "ishaans-langmsith-key",
|
||||||
"user_api_end_user_max_budget": None,
|
"user_api_end_user_max_budget": None,
|
||||||
|
|
|
@ -128,6 +128,19 @@ def test_azure_ai_mistral_optional_params():
|
||||||
assert "user" not in optional_params
|
assert "user" not in optional_params
|
||||||
|
|
||||||
|
|
||||||
|
def test_vertex_ai_llama_3_optional_params():
|
||||||
|
litellm.vertex_llama3_models = ["meta/llama3-405b-instruct-maas"]
|
||||||
|
litellm.drop_params = True
|
||||||
|
optional_params = get_optional_params(
|
||||||
|
model="meta/llama3-405b-instruct-maas",
|
||||||
|
user="John",
|
||||||
|
custom_llm_provider="vertex_ai",
|
||||||
|
max_tokens=10,
|
||||||
|
temperature=0.2,
|
||||||
|
)
|
||||||
|
assert "user" not in optional_params
|
||||||
|
|
||||||
|
|
||||||
def test_azure_gpt_optional_params_gpt_vision():
|
def test_azure_gpt_optional_params_gpt_vision():
|
||||||
# for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. We assert we actually set extra_body here
|
# for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. We assert we actually set extra_body here
|
||||||
optional_params = litellm.utils.get_optional_params(
|
optional_params = litellm.utils.get_optional_params(
|
||||||
|
|
|
@ -212,7 +212,7 @@ def test_convert_url_to_img():
|
||||||
[
|
[
|
||||||
("data:image/jpeg;base64,1234", "image/jpeg"),
|
("data:image/jpeg;base64,1234", "image/jpeg"),
|
||||||
("data:application/pdf;base64,1234", "application/pdf"),
|
("data:application/pdf;base64,1234", "application/pdf"),
|
||||||
("data:image\/jpeg;base64,1234", "image/jpeg"),
|
(r"data:image\/jpeg;base64,1234", "image/jpeg"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_base64_image_input(url, expected_media_type):
|
def test_base64_image_input(url, expected_media_type):
|
||||||
|
|
|
@ -19,7 +19,7 @@ import pytest
|
||||||
|
|
||||||
import litellm
|
import litellm
|
||||||
from litellm.proxy._types import LiteLLMRoutes
|
from litellm.proxy._types import LiteLLMRoutes
|
||||||
from litellm.proxy.auth.auth_utils import is_openai_route
|
from litellm.proxy.auth.auth_utils import is_llm_api_route
|
||||||
from litellm.proxy.proxy_server import app
|
from litellm.proxy.proxy_server import app
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
|
@ -77,8 +77,8 @@ def test_routes_on_litellm_proxy():
|
||||||
("/v1/non_existent_endpoint", False),
|
("/v1/non_existent_endpoint", False),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_is_openai_route(route: str, expected: bool):
|
def test_is_llm_api_route(route: str, expected: bool):
|
||||||
assert is_openai_route(route) == expected
|
assert is_llm_api_route(route) == expected
|
||||||
|
|
||||||
|
|
||||||
# Test case for routes that are similar but should return False
|
# Test case for routes that are similar but should return False
|
||||||
|
@ -91,5 +91,10 @@ def test_is_openai_route(route: str, expected: bool):
|
||||||
"/engines/model/invalid/completions",
|
"/engines/model/invalid/completions",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_is_openai_route_similar_but_false(route: str):
|
def test_is_llm_api_route_similar_but_false(route: str):
|
||||||
assert is_openai_route(route) == False
|
assert is_llm_api_route(route) == False
|
||||||
|
|
||||||
|
|
||||||
|
def test_anthropic_api_routes():
|
||||||
|
# allow non proxy admins to call anthropic api routes
|
||||||
|
assert is_llm_api_route(route="/v1/messages") is True
|
||||||
|
|
|
@ -173,6 +173,63 @@ def test_chat_completion(mock_acompletion, client_no_auth):
|
||||||
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
|
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@mock_patch_acompletion()
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_team_disable_guardrails(mock_acompletion, client_no_auth):
|
||||||
|
"""
|
||||||
|
If team not allowed to turn on/off guardrails
|
||||||
|
|
||||||
|
Raise 403 forbidden error, if request is made by team on `/key/generate` or `/chat/completions`.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
from fastapi import HTTPException, Request
|
||||||
|
from starlette.datastructures import URL
|
||||||
|
|
||||||
|
from litellm.proxy._types import LiteLLM_TeamTable, ProxyException, UserAPIKeyAuth
|
||||||
|
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
||||||
|
from litellm.proxy.proxy_server import hash_token, user_api_key_cache
|
||||||
|
|
||||||
|
_team_id = "1234"
|
||||||
|
user_key = "sk-12345678"
|
||||||
|
|
||||||
|
valid_token = UserAPIKeyAuth(
|
||||||
|
team_id=_team_id,
|
||||||
|
team_blocked=True,
|
||||||
|
token=hash_token(user_key),
|
||||||
|
last_refreshed_at=time.time(),
|
||||||
|
)
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
team_obj = LiteLLM_TeamTable(
|
||||||
|
team_id=_team_id,
|
||||||
|
blocked=False,
|
||||||
|
last_refreshed_at=time.time(),
|
||||||
|
metadata={"guardrails": {"modify_guardrails": False}},
|
||||||
|
)
|
||||||
|
user_api_key_cache.set_cache(key=hash_token(user_key), value=valid_token)
|
||||||
|
user_api_key_cache.set_cache(key="team_id:{}".format(_team_id), value=team_obj)
|
||||||
|
|
||||||
|
setattr(litellm.proxy.proxy_server, "user_api_key_cache", user_api_key_cache)
|
||||||
|
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||||
|
setattr(litellm.proxy.proxy_server, "prisma_client", "hello-world")
|
||||||
|
|
||||||
|
request = Request(scope={"type": "http"})
|
||||||
|
request._url = URL(url="/chat/completions")
|
||||||
|
|
||||||
|
body = {"metadata": {"guardrails": {"hide_secrets": False}}}
|
||||||
|
json_bytes = json.dumps(body).encode("utf-8")
|
||||||
|
|
||||||
|
request._body = json_bytes
|
||||||
|
|
||||||
|
try:
|
||||||
|
await user_api_key_auth(request=request, api_key="Bearer " + user_key)
|
||||||
|
pytest.fail("Expected to raise 403 forbidden error.")
|
||||||
|
except ProxyException as e:
|
||||||
|
assert e.code == 403
|
||||||
|
|
||||||
|
|
||||||
from litellm.tests.test_custom_callback_input import CompletionCustomHandler
|
from litellm.tests.test_custom_callback_input import CompletionCustomHandler
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,8 @@ sys.path.insert(
|
||||||
import pytest
|
import pytest
|
||||||
from litellm import get_secret
|
from litellm import get_secret
|
||||||
from litellm.proxy.secret_managers.aws_secret_manager import load_aws_secret_manager
|
from litellm.proxy.secret_managers.aws_secret_manager import load_aws_secret_manager
|
||||||
|
from litellm.llms.azure import get_azure_ad_token_from_oidc
|
||||||
|
from litellm.llms.bedrock_httpx import BedrockLLM
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="AWS Suspended Account")
|
@pytest.mark.skip(reason="AWS Suspended Account")
|
||||||
|
@ -60,7 +62,7 @@ def test_oidc_github():
|
||||||
)
|
)
|
||||||
def test_oidc_circleci():
|
def test_oidc_circleci():
|
||||||
secret_val = get_secret(
|
secret_val = get_secret(
|
||||||
"oidc/circleci/https://bedrock-runtime.us-east-1.amazonaws.com/model/amazon.titan-text-express-v1/invoke"
|
"oidc/circleci/"
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"secret_val: {redact_oidc_signature(secret_val)}")
|
print(f"secret_val: {redact_oidc_signature(secret_val)}")
|
||||||
|
@ -76,3 +78,38 @@ def test_oidc_circleci_v2():
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"secret_val: {redact_oidc_signature(secret_val)}")
|
print(f"secret_val: {redact_oidc_signature(secret_val)}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
|
||||||
|
reason="Cannot run without being in CircleCI Runner",
|
||||||
|
)
|
||||||
|
def test_oidc_circleci_with_azure():
|
||||||
|
# TODO: Switch to our own Azure account, currently using ai.moda's account
|
||||||
|
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
|
||||||
|
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
|
||||||
|
azure_ad_token = get_azure_ad_token_from_oidc("oidc/circleci/")
|
||||||
|
|
||||||
|
print(f"secret_val: {redact_oidc_signature(azure_ad_token)}")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
|
||||||
|
reason="Cannot run without being in CircleCI Runner",
|
||||||
|
)
|
||||||
|
def test_oidc_circle_v1_with_amazon():
|
||||||
|
# The purpose of this test is to get logs using the older v1 of the CircleCI OIDC token
|
||||||
|
|
||||||
|
# TODO: This is using ai.moda's IAM role, we should use LiteLLM's IAM role eventually
|
||||||
|
aws_role_name = (
|
||||||
|
"arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci-v1-assume-only"
|
||||||
|
)
|
||||||
|
aws_web_identity_token = "oidc/circleci/"
|
||||||
|
|
||||||
|
bllm = BedrockLLM()
|
||||||
|
creds = bllm.get_credentials(
|
||||||
|
aws_region_name="ca-west-1",
|
||||||
|
aws_web_identity_token=aws_web_identity_token,
|
||||||
|
aws_role_name=aws_role_name,
|
||||||
|
aws_session_name="assume-v1-session",
|
||||||
|
)
|
||||||
|
|
|
@ -1988,25 +1988,30 @@ async def test_hf_completion_tgi_stream():
|
||||||
|
|
||||||
# test on openai completion call
|
# test on openai completion call
|
||||||
def test_openai_chat_completion_call():
|
def test_openai_chat_completion_call():
|
||||||
try:
|
litellm.set_verbose = False
|
||||||
litellm.set_verbose = False
|
litellm.return_response_headers = True
|
||||||
print(f"making openai chat completion call")
|
print(f"making openai chat completion call")
|
||||||
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
|
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
|
||||||
complete_response = ""
|
assert isinstance(
|
||||||
start_time = time.time()
|
response._hidden_params["additional_headers"][
|
||||||
for idx, chunk in enumerate(response):
|
"llm_provider-x-ratelimit-remaining-requests"
|
||||||
chunk, finished = streaming_format_tests(idx, chunk)
|
],
|
||||||
print(f"outside chunk: {chunk}")
|
str,
|
||||||
if finished:
|
)
|
||||||
break
|
|
||||||
complete_response += chunk
|
print(f"response._hidden_params: {response._hidden_params}")
|
||||||
# print(f'complete_chunk: {complete_response}')
|
complete_response = ""
|
||||||
if complete_response.strip() == "":
|
start_time = time.time()
|
||||||
raise Exception("Empty response received")
|
for idx, chunk in enumerate(response):
|
||||||
print(f"complete response: {complete_response}")
|
chunk, finished = streaming_format_tests(idx, chunk)
|
||||||
except:
|
print(f"outside chunk: {chunk}")
|
||||||
print(f"error occurred: {traceback.format_exc()}")
|
if finished:
|
||||||
pass
|
break
|
||||||
|
complete_response += chunk
|
||||||
|
# print(f'complete_chunk: {complete_response}')
|
||||||
|
if complete_response.strip() == "":
|
||||||
|
raise Exception("Empty response received")
|
||||||
|
print(f"complete response: {complete_response}")
|
||||||
|
|
||||||
|
|
||||||
# test_openai_chat_completion_call()
|
# test_openai_chat_completion_call()
|
||||||
|
|
|
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Union

 from pydantic import BaseModel, validator
 from typing_extensions import Literal, Required, TypedDict
@@ -113,6 +113,9 @@ class AnthropicMessagesRequest(TypedDict, total=False):
     top_k: int
     top_p: float

+    # litellm param - used for tracking litellm proxy metadata in the request
+    litellm_metadata: dict
+

 class ContentTextBlockDelta(TypedDict):
     """
@@ -436,6 +436,7 @@ class ChatCompletionRequest(TypedDict, total=False):
     function_call: Union[str, dict]
     functions: List
     user: str
+    metadata: dict  # litellm specific param


 class ChatCompletionDeltaChunk(TypedDict, total=False):
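The new `litellm_metadata` field lets the proxy attach its own tracking metadata to Anthropic `/v1/messages` requests without colliding with Anthropic's own `metadata` parameter; the adapter is then expected to fold it back into litellm's `metadata` kwarg before calling `litellm.acompletion()` (see `test_anthropic_completion_input_translation_with_metadata` earlier in this diff). A rough sketch of that renaming step, independent of the real adapter code:

```python
# Rough sketch of folding litellm_metadata back into litellm's `metadata` kwarg
# during input translation. This mirrors the assertions in
# test_anthropic_completion_input_translation_with_metadata, not the real
# adapter implementation.
def translate_anthropic_kwargs(kwargs: dict) -> dict:
    translated = dict(kwargs)
    litellm_metadata = translated.pop("litellm_metadata", None)
    if litellm_metadata is not None and "metadata" not in translated:
        translated["metadata"] = litellm_metadata
    return translated


data = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    "litellm_metadata": {"user_api_key": "hashed-key", "user_api_key_team_id": None},
}
translated = translate_anthropic_kwargs(data)
assert "litellm_metadata" not in translated
assert translated["metadata"] == data["litellm_metadata"]
```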
|
@@ -1029,3 +1029,22 @@ class GenericImageParsingChunk(TypedDict):
 class ResponseFormatChunk(TypedDict, total=False):
     type: Required[Literal["json_object", "text"]]
     response_schema: dict
+
+
+class LoggedLiteLLMParams(TypedDict, total=False):
+    force_timeout: Optional[float]
+    custom_llm_provider: Optional[str]
+    api_base: Optional[str]
+    litellm_call_id: Optional[str]
+    model_alias_map: Optional[dict]
+    metadata: Optional[dict]
+    model_info: Optional[dict]
+    proxy_server_request: Optional[dict]
+    acompletion: Optional[bool]
+    preset_cache_key: Optional[str]
+    no_log: Optional[bool]
+    input_cost_per_second: Optional[float]
+    input_cost_per_token: Optional[float]
+    output_cost_per_token: Optional[float]
+    output_cost_per_second: Optional[float]
+    cooldown_time: Optional[float]
|
litellm/utils.py (111 changed lines)
@@ -129,6 +129,7 @@ from .exceptions import (
     ServiceUnavailableError,
     Timeout,
     UnprocessableEntityError,
+    UnsupportedParamsError,
 )
 from .proxy._types import KeyManagementSystem
 from .types.llms.openai import (
@@ -158,6 +159,7 @@ from typing import (
     Tuple,
     Union,
     cast,
+    get_args,
 )

 from .caching import Cache
@@ -224,17 +226,6 @@ last_fetched_at_keys = None
 # }


-class UnsupportedParamsError(Exception):
-    def __init__(self, status_code, message):
-        self.status_code = status_code
-        self.message = message
-        self.request = httpx.Request(method="POST", url=" https://openai.api.com/v1/")
-        self.response = httpx.Response(status_code=status_code, request=self.request)
-        super().__init__(
-            self.message
-        )  # Call the base class constructor with the parameters it needs
-
-
 ############################################################
 def print_verbose(
     print_statement,
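With `UnsupportedParamsError` now imported from `.exceptions` instead of being defined inline in `utils.py`, callers can catch it as `litellm.UnsupportedParamsError`, which is exactly what the updated cohere test earlier in this diff asserts. A small usage sketch, assuming a configured cohere key and that `command-nightly` still rejects the `seed` parameter:

```python
# Minimal sketch: catching the relocated UnsupportedParamsError.
# Assumes a configured cohere key and that `seed` is still unsupported there,
# mirroring test_completion_invalid_param_cohere earlier in this diff.
import litellm

try:
    litellm.completion(
        model="command-nightly",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
        seed=12,  # not supported by cohere -> should raise
    )
except litellm.UnsupportedParamsError as e:
    print("unsupported params:", e)
```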
|
@@ -405,7 +396,6 @@ def function_setup(
     # Pop the async items from input_callback in reverse order to avoid index issues
     for index in reversed(removed_async_items):
         litellm.input_callback.pop(index)
-
     if len(litellm.success_callback) > 0:
         removed_async_items = []
         for index, callback in enumerate(litellm.success_callback):  # type: ignore

@@ -417,9 +407,9 @@ def function_setup(
                 # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
                 litellm._async_success_callback.append(callback)
                 removed_async_items.append(index)
-            elif callback == "langsmith":
+            elif callback in litellm._known_custom_logger_compatible_callbacks:
                 callback_class = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class(  # type: ignore
-                    callback, internal_usage_cache=None, llm_router=None
+                    callback, internal_usage_cache=None, llm_router=None  # type: ignore
                 )

                 # don't double add a callback
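function_setup now matches callback names against litellm._known_custom_logger_compatible_callbacks instead of the single hard-coded "langsmith" string, so every logger-compatible integration is initialized through the same path. A hedged usage sketch (the model name is illustrative and mock_response keeps it offline):

    import litellm

    # "langsmith" was the only name handled before; it is one of the
    # logger-compatible callbacks covered by the new membership check.
    litellm.success_callback = ["langsmith"]

    response = litellm.completion(
        model="gpt-3.5-turbo",                            # illustrative model
        messages=[{"role": "user", "content": "hi"}],
        mock_response="hello",                            # no real API call is made
    )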
@@ -3088,6 +3078,15 @@ def get_optional_params(
            non_default_params=non_default_params,
            optional_params=optional_params,
        )
+    elif custom_llm_provider == "vertex_ai" and model in litellm.vertex_llama3_models:
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+        _check_valid_arg(supported_params=supported_params)
+        optional_params = litellm.VertexAILlama3Config().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+        )
    elif custom_llm_provider == "sagemaker":
        ## check if unsupported param passed in
        supported_params = get_supported_openai_params(
@@ -4189,6 +4188,9 @@ def get_supported_openai_params(
        return litellm.GoogleAIStudioGeminiConfig().get_supported_openai_params()
    elif custom_llm_provider == "vertex_ai":
        if request_type == "chat_completion":
+            if model.startswith("meta/"):
+                return litellm.VertexAILlama3Config().get_supported_openai_params()
+
            return litellm.VertexAIConfig().get_supported_openai_params()
        elif request_type == "embeddings":
            return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
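Together with the get_optional_params branch above, OpenAI-style params for Vertex AI Llama 3 models ("meta/..." on vertex_ai) are now resolved through VertexAILlama3Config. A hedged sketch of querying the supported params; the exact list returned depends on that config class:

    from litellm.utils import get_supported_openai_params

    params = get_supported_openai_params(
        model="meta/llama3-405b-instruct-maas",  # the "meta/" prefix triggers the new branch
        custom_llm_provider="vertex_ai",
        request_type="chat_completion",
    )
    print(params)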
@@ -4484,7 +4486,11 @@ def get_llm_provider(
                or get_secret("TOGETHER_AI_TOKEN")
            )
        elif custom_llm_provider == "friendliai":
-            api_base = "https://inference.friendli.ai/v1"
+            api_base = (
+                api_base
+                or get_secret("FRIENDLI_API_BASE")
+                or "https://inference.friendli.ai/v1"
+            )
            dynamic_api_key = (
                api_key
                or get_secret("FRIENDLIAI_API_KEY")
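The FriendliAI api_base is no longer pinned to the public endpoint: an explicitly passed api_base wins, then the FRIENDLI_API_BASE secret/env var, then the default URL. A hedged sketch of the resolution order (the base URL and model name below are placeholders):

    import os
    from litellm.utils import get_llm_provider

    os.environ["FRIENDLI_API_BASE"] = "https://example.internal/friendli/v1"  # placeholder
    os.environ["FRIENDLIAI_API_KEY"] = "flp_..."                              # placeholder

    model, provider, api_key, api_base = get_llm_provider(
        model="friendliai/meta-llama-3-8b-instruct"  # placeholder model name
    )
    print(api_base)  # resolves to FRIENDLI_API_BASE instead of the hard-coded default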
@@ -5678,6 +5684,14 @@ def convert_to_model_response_object(
    _response_headers: Optional[dict] = None,
 ):
    received_args = locals()
+    if _response_headers is not None:
+        llm_response_headers = {
+            "{}-{}".format("llm_provider", k): v for k, v in _response_headers.items()
+        }
+        if hidden_params is not None:
+            hidden_params["additional_headers"] = llm_response_headers
+        else:
+            hidden_params = {"additional_headers": llm_response_headers}
    ### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
    if (
        response_object is not None
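Provider response headers are now surfaced to callers by prefixing each key with llm_provider- and attaching the result to hidden_params["additional_headers"]. A standalone illustration of that mapping (the sample headers are made up):

    _response_headers = {
        "x-ratelimit-remaining-requests": "99",   # made-up sample header
        "x-request-id": "req_123",                # made-up sample header
    }
    llm_response_headers = {
        "{}-{}".format("llm_provider", k): v for k, v in _response_headers.items()
    }
    hidden_params = {"additional_headers": llm_response_headers}
    # {'additional_headers': {'llm_provider-x-ratelimit-remaining-requests': '99',
    #                         'llm_provider-x-request-id': 'req_123'}}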
@@ -5744,10 +5758,12 @@ def convert_to_model_response_object(
            model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0)  # type: ignore

        if "created" in response_object:
-            model_response_object.created = response_object["created"]
+            model_response_object.created = response_object["created"] or int(
+                time.time()
+            )

        if "id" in response_object:
-            model_response_object.id = response_object["id"]
+            model_response_object.id = response_object["id"] or str(uuid.uuid4())

        if "system_fingerprint" in response_object:
            model_response_object.system_fingerprint = response_object[
@@ -8312,8 +8328,13 @@ class CustomStreamWrapper:
            or {}
        )
        self._hidden_params = {
-            "model_id": (_model_info.get("id", None))
+            "model_id": (_model_info.get("id", None)),
        }  # returned as x-litellm-model-id response header in proxy
+        if _response_headers is not None:
+            self._hidden_params["additional_headers"] = {
+                "{}-{}".format("llm_provider", k): v
+                for k, v in _response_headers.items()
+            }
        self._response_headers = _response_headers
        self.response_id = None
        self.logging_loop = None
@@ -8808,11 +8829,14 @@ class CustomStreamWrapper:
                    str_line.choices[0].content_filter_result
                )
            else:
-                error_message = "Azure Response={}".format(
-                    str(dict(str_line))
+                error_message = "{} Response={}".format(
+                    self.custom_llm_provider, str(dict(str_line))
                )
-                raise litellm.AzureOpenAIError(
-                    status_code=400, message=error_message
+                raise litellm.ContentPolicyViolationError(
+                    message=error_message,
+                    llm_provider=self.custom_llm_provider,
+                    model=self.model,
                )

            # checking for logprobs
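Streaming content-filter failures are now raised as litellm.ContentPolicyViolationError, carrying the provider name and model, instead of an Azure-specific error. A hedged sketch of catching it; the deployment name is a placeholder and this assumes the provider actually triggers its content filter:

    import litellm

    try:
        for chunk in litellm.completion(
            model="azure/my-deployment",                       # placeholder deployment
            messages=[{"role": "user", "content": "..."}],
            stream=True,
        ):
            pass
    except litellm.ContentPolicyViolationError as e:
        print(f"blocked by {e.llm_provider}: {e.message}")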
@@ -9094,6 +9118,42 @@ class CustomStreamWrapper:
        except Exception as e:
            raise e

+    def handle_triton_stream(self, chunk):
+        try:
+            if isinstance(chunk, dict):
+                parsed_response = chunk
+            elif isinstance(chunk, (str, bytes)):
+                if isinstance(chunk, bytes):
+                    chunk = chunk.decode("utf-8")
+                if "text_output" in chunk:
+                    response = chunk.replace("data: ", "").strip()
+                    parsed_response = json.loads(response)
+                else:
+                    return {
+                        "text": "",
+                        "is_finished": False,
+                        "prompt_tokens": 0,
+                        "completion_tokens": 0,
+                    }
+            else:
+                print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+                raise ValueError(
+                    f"Unable to parse response. Original response: {chunk}"
+                )
+            text = parsed_response.get("text_output", "")
+            finish_reason = parsed_response.get("stop_reason")
+            is_finished = parsed_response.get("is_finished", False)
+            return {
+                "text": text,
+                "is_finished": is_finished,
+                "finish_reason": finish_reason,
+                "prompt_tokens": parsed_response.get("input_token_count", 0),
+                "completion_tokens": parsed_response.get("generated_token_count", 0),
+            }
+            return {"text": "", "is_finished": False}
+        except Exception as e:
+            raise e
+
    def handle_clarifai_completion_chunk(self, chunk):
        try:
            if isinstance(chunk, dict):
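For reference, handle_triton_stream expects Triton-style SSE lines whose JSON payload carries text_output, is_finished, and stop_reason. A standalone illustration of the same parsing on a made-up chunk:

    import json

    chunk = 'data: {"text_output": "Hello", "is_finished": false, "stop_reason": null}'
    if "text_output" in chunk:
        parsed = json.loads(chunk.replace("data: ", "").strip())
        piece = {
            "text": parsed.get("text_output", ""),
            "is_finished": parsed.get("is_finished", False),
            "finish_reason": parsed.get("stop_reason"),
        }
        print(piece)  # {'text': 'Hello', 'is_finished': False, 'finish_reason': None}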
@@ -9513,6 +9573,12 @@ class CustomStreamWrapper:
                completion_obj["content"] = response_obj["text"]
                if response_obj["is_finished"]:
                    self.received_finish_reason = response_obj["finish_reason"]
+            elif self.custom_llm_provider == "triton":
+                response_obj = self.handle_triton_stream(chunk)
+                completion_obj["content"] = response_obj["text"]
+                print_verbose(f"completion obj content: {completion_obj['content']}")
+                if response_obj["is_finished"]:
+                    self.received_finish_reason = response_obj["finish_reason"]
            elif self.custom_llm_provider == "text-completion-openai":
                response_obj = self.handle_openai_text_completion_chunk(chunk)
                completion_obj["content"] = response_obj["text"]
@@ -10068,6 +10134,7 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "predibase"
                or self.custom_llm_provider == "databricks"
                or self.custom_llm_provider == "bedrock"
+                or self.custom_llm_provider == "triton"
                or self.custom_llm_provider == "watsonx"
                or self.custom_llm_provider in litellm.openai_compatible_endpoints
            ):
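Adding "triton" to this provider list routes Triton streams through the generic chunk path above. A hedged end-to-end sketch; the model identifier and api_base are placeholders, not verified values:

    import litellm

    for chunk in litellm.completion(
        model="triton/llama-3-8b",                                        # placeholder
        api_base="http://localhost:8000/v2/models/llama-3-8b/generate",  # placeholder
        messages=[{"role": "user", "content": "Say hi"}],
        stream=True,
    ):
        print(chunk.choices[0].delta.content or "", end="")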
@@ -760,6 +760,36 @@
        "litellm_provider": "azure_ai",
        "mode": "chat"
    },
+    "azure_ai/Meta-Llama-31-8B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.00000061,
+        "litellm_provider": "azure_ai",
+        "mode": "chat",
+        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
+    },
+    "azure_ai/Meta-Llama-31-70B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.00000268,
+        "output_cost_per_token": 0.00000354,
+        "litellm_provider": "azure_ai",
+        "mode": "chat",
+        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
+    },
+    "azure_ai/Meta-Llama-31-405B-Instruct": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.00000533,
+        "output_cost_per_token": 0.000016,
+        "litellm_provider": "azure_ai",
+        "mode": "chat",
+        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
+    },
    "babbage-002": {
        "max_tokens": 16384,
        "max_input_tokens": 16384,
@@ -1948,6 +1978,16 @@
        "supports_function_calling": true,
        "supports_vision": true
    },
+    "vertex_ai/meta/llama3-405b-instruct-maas": {
+        "max_tokens": 32000,
+        "max_input_tokens": 32000,
+        "max_output_tokens": 32000,
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "vertex_ai-llama_models",
+        "mode": "chat",
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
+    },
    "vertex_ai/imagegeneration@006": {
        "cost_per_image": 0.020,
        "litellm_provider": "vertex_ai-image-models",
@@ -3633,6 +3673,24 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
+    "meta.llama3-1-8b-instruct-v1:0": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 2048,
+        "input_cost_per_token": 0.0000004,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "meta.llama3-1-70b-instruct-v1:0": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 2048,
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
    "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
        "max_tokens": 77,
        "max_input_tokens": 77,
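With the pricing entries above in the model map, litellm's cost helpers can price Bedrock Llama 3.1 usage. A hedged sketch (token counts are illustrative, and it assumes a litellm build that already ships these entries):

    import litellm

    prompt_cost, completion_cost = litellm.cost_per_token(
        model="meta.llama3-1-8b-instruct-v1:0",
        prompt_tokens=1000,
        completion_tokens=500,
    )
    # expected: 1000 * 0.0000004 and 500 * 0.0000006 from the entry above
    print(prompt_cost, completion_cost)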
prometheus.yml (new file, 7 lines)
@@ -0,0 +1,7 @@
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  - job_name: 'litellm'
+    static_configs:
+      - targets: ['litellm:4000'] # Assuming Litellm exposes metrics at port 4000
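This scrape config assumes the proxy serves Prometheus-format metrics on port 4000. A hedged way to check the target locally from Python; the /metrics path and the litellm_ metric prefix are assumptions, not taken from this diff:

    import urllib.request

    with urllib.request.urlopen("http://localhost:4000/metrics", timeout=5) as resp:
        body = resp.read().decode("utf-8")

    # Prometheus exposition format: one "metric_name{labels} value" line per sample
    print([line for line in body.splitlines() if line.startswith("litellm_")][:5])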
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.41.26"
+version = "1.42.0"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"

@@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.41.26"
+version = "1.42.0"
 version_files = [
     "pyproject.toml:^version"
 ]
@@ -172,7 +172,7 @@ model LiteLLM_Config {
 model LiteLLM_SpendLogs {
   request_id String @id
   call_type String
-  api_key String @default ("")
+  api_key String @default ("") // Hashed API Token. Not the actual Virtual Key. Equivalent to 'token' column in LiteLLM_VerificationToken
   spend Float @default(0.0)
   total_tokens Int @default(0)
   prompt_tokens Int @default(0)

@@ -183,12 +183,12 @@ model LiteLLM_SpendLogs {
   model String @default("")
   model_id String? @default("") // the model id stored in proxy model db
   model_group String? @default("") // public model_name / model_group
-  api_base String @default("")
-  user String @default("")
-  metadata Json @default("{}")
-  cache_hit String @default("")
-  cache_key String @default("")
-  request_tags Json @default("[]")
+  api_base String? @default("")
+  user String? @default("")
+  metadata Json? @default("{}")
+  cache_hit String? @default("")
+  cache_key String? @default("")
+  request_tags Json? @default("[]")
   team_id String?
   end_user String?
   requester_ip_address String?