Merge branch 'main' into litellm_parallel_requests

This commit is contained in:
Krish Dholakia 2024-07-24 19:25:56 -07:00 committed by GitHub
commit e6963217ba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
79 changed files with 3913 additions and 180 deletions

View file

@ -8,7 +8,7 @@
<img src="https://railway.app/button.svg" alt="Deploy on Railway">
</a>
</p>
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>

View file

@ -9,13 +9,11 @@ services:
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./proxy_server_config.yaml:/app/config.yaml
# command: [ "--config", "./config.yaml", "--port", "4000"]
###############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://postgres:example@db:5432/postgres"
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
@ -25,11 +23,31 @@ services:
image: postgres
restart: always
environment:
POSTGRES_PASSWORD: example
POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
healthcheck:
test: ["CMD-SHELL", "pg_isready"]
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s
timeout: 5s
retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
# ...rest of your docker-compose config if any
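The compose file above mounts a `./prometheus.yml` that is not shown in this diff. A minimal sketch of what it could contain (the scrape target, service name, and intervals below are assumptions; adjust them for your deployment):
```yaml
# prometheus.yml - minimal sketch; all values below are assumptions
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: "litellm"
    metrics_path: /metrics            # LiteLLM proxy's Prometheus metrics endpoint
    static_configs:
      - targets: ["litellm:4000"]     # assumes the proxy service is reachable as `litellm` on port 4000
```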

View file

@ -0,0 +1,72 @@
import Image from '@theme/IdealImage';
# 🔥 Arize AI - Logging LLM Input/Output
AI Observability and Evaluation Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
## Pre-Requisites
Make an account on [Arize AI](https://app.arize.com/auth/login)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
### Using with LiteLLM Proxy
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
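With this config, requests through the proxy should show up in Arize. A quick test (a sketch that assumes the proxy runs on port 4000 with master key `sk-1234`):
```shell
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "hello from litellm"}]
}'
```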
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai

View file

@ -0,0 +1,147 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# ⚡️ Braintrust - Evals + Logging
[Braintrust](https://www.braintrust.dev/) covers evaluations, logging, a prompt playground, and data management for AI products.
## Quick Start
```python
# pip install litellm
import litellm
import os
# set env
os.environ["BRAINTRUST_API_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
## OpenAI Proxy Usage
1. Add keys to env
```env
BRAINTRUST_API_KEY=""
```
2. Add braintrust to callbacks
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
callbacks: ["braintrust"]
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
## Advanced - pass Project ID
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"project_id": "my-special-project"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
],
"metadata": {
"project_id": "my-special-project"
}
}'
```
**OpenAI SDK**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
"metadata": { # 👈 use for logging additional params (e.g. to langfuse)
"project_id": "my-special-project"
}
}
)
print(response)
```
For more examples, [**Click Here**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>
## Full API Spec
Here's everything you can pass in `metadata` for a Braintrust request (see the sketch below):
- `braintrust_*` - any metadata field starting with `braintrust_` will be passed as metadata to the logging request
- `project_id` - set the project id for a Braintrust call. Default is `litellm`.
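A sketch combining both fields (the `braintrust_environment` key is a made-up example of a `braintrust_*` field):
```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
    metadata={
        "project_id": "my-special-project",    # log to this Braintrust project instead of the default `litellm`
        "braintrust_environment": "staging",   # hypothetical example of a `braintrust_*` metadata field
    },
)
```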

View file

@ -1,4 +1,4 @@
# 🧠 Helicone - OSS LLM Observability Platform
# 🧊 Helicone - OSS LLM Observability Platform
:::tip

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Langsmith - Logging LLM Input/Output
# 🦜 Langsmith - Logging LLM Input/Output
:::tip
@ -56,7 +56,7 @@ response = litellm.completion(
```
## Advanced
### Set Custom Project & Run names
### Set Langsmith fields - Custom Project, Run names, tags
```python
import litellm
@ -77,6 +77,7 @@ response = litellm.completion(
metadata={
"run_name": "litellmRUN", # langsmith run name
"project_name": "litellm-completion", # langsmith project name
"tags": ["model1", "prod-2"] # tags to log on langsmith
}
)
print(response)

View file

@ -1,10 +1,16 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Raw Request/Response Logging
## Logging
See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.).
**on SDK**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install langfuse
import litellm
@ -34,13 +40,85 @@ response = litellm.completion(
)
```
**on Proxy**
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
log_raw_request_response: True
```
</TabItem>
</Tabs>
**Expected Log**
<Image img={require('../../img/raw_request_log.png')}/>
## Return Raw Response Headers
Return raw response headers from the LLM provider.
Currently only supported for OpenAI.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
litellm.return_response_headers = True
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
print(response._hidden_params)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
return_response_headers: true
```
2. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
</TabItem>
</Tabs>
**Expected Response**
<Image img={require('../../img/raw_response_headers.png')}/>

View file

@ -0,0 +1,223 @@
# OpenID Connect (OIDC)
LiteLLM supports using OpenID Connect (OIDC) for authentication to upstream services. This allows you to avoid storing sensitive credentials in your configuration files.
## OIDC Identity Provider (IdP)
LiteLLM supports the following OIDC identity providers:
| Provider | Config Name | Custom Audiences |
| -------------------------| ------------ | ---------------- |
| Google Cloud Run | `google` | Yes |
| CircleCI v1 | `circleci` | No |
| CircleCI v2 | `circleci_v2`| No |
| GitHub Actions | `github` | Yes |
| Azure Kubernetes Service | `azure` | No |
If you would like to use a different OIDC provider, please open an issue on GitHub.
## OIDC Relying Party (RP)
LiteLLM supports the following OIDC relying parties / clients:
- Amazon Bedrock
- Azure OpenAI
- _(Coming soon) Google Cloud Vertex AI_
### Configuring OIDC
Wherever a secret key can be used, OIDC can be used in-place. The general format is:
```
oidc/config_name_here/audience_here
```
For providers that do not use the `audience` parameter, you can (and should) omit it:
```
oidc/config_name_here/
```
## Examples
### Google Cloud Run -> Amazon Bedrock
```yaml
model_list:
- model_name: claude-3-haiku-20240307
litellm_params:
model: bedrock/anthropic.claude-3-haiku-20240307-v1:0
aws_region_name: us-west-2
aws_session_name: "litellm"
aws_role_name: "arn:aws:iam::YOUR_THING_HERE:role/litellm-google-demo"
aws_web_identity_token: "oidc/google/https://example.com"
```
### CircleCI v2 -> Amazon Bedrock
```yaml
model_list:
- model_name: command-r
litellm_params:
model: bedrock/cohere.command-r-v1:0
aws_region_name: us-west-2
aws_session_name: "my-test-session"
aws_role_name: "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
aws_web_identity_token: "oidc/circleci_v2/"
```
#### Amazon IAM Role Configuration for CircleCI v2 -> Bedrock
The configuration below is only an example. You should adjust the permissions and trust relationship to match your specific use case.
Permissions:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor0",
"Effect": "Allow",
"Action": [
"bedrock:InvokeModel",
"bedrock:InvokeModelWithResponseStream"
],
"Resource": [
"arn:aws:bedrock:*::foundation-model/anthropic.claude-3-haiku-20240307-v1:0",
"arn:aws:bedrock:*::foundation-model/cohere.command-r-v1:0"
]
}
]
}
```
See https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html for more examples.
Trust Relationship:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Federated": "arn:aws:iam::335785316107:oidc-provider/oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"Action": "sts:AssumeRoleWithWebIdentity",
"Condition": {
"StringEquals": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:aud": "c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"ForAnyValue:StringLike": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:sub": [
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/main",
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/litellm_*"
]
}
}
}
]
}
```
This trust relationship restricts CircleCI to only assume the role on the main branch and branches that start with `litellm_`.
For CircleCI (v1 and v2), you also need to add your organization's OIDC provider in your AWS IAM settings. See https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-idp_oidc.html for more information.
:::tip
You should _never_ need to create an IAM user. If you did, you're not using OIDC correctly. You should only be creating a role with permissions and a trust relationship to your OIDC provider.
:::
### Google Cloud Run -> Azure OpenAI
```yaml
model_list:
- model_name: gpt-4o-2024-05-13
litellm_params:
model: azure/gpt-4o-2024-05-13
azure_ad_token: "oidc/google/https://example.com"
api_version: "2024-06-01"
api_base: "https://demo-here.openai.azure.com"
model_info:
base_model: azure/gpt-4o-2024-05-13
```
For Azure OpenAI, you need to define `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, and optionally `AZURE_AUTHORITY_HOST` in your environment.
```bash
export AZURE_CLIENT_ID="91a43c21-cf21-4f34-9085-331015ea4f91" # Azure AD Application (Client) ID
export AZURE_TENANT_ID="f3b1cf79-eba8-40c3-8120-cb26aca169c2" # Will be the same across all of your Azure AD applications
export AZURE_AUTHORITY_HOST="https://login.microsoftonline.com" # 👈 Optional, defaults to "https://login.microsoftonline.com"
```
:::tip
You can find `AZURE_TENANT_ID` by visiting `https://login.microsoftonline.com/YOUR_DOMAIN_HERE/v2.0/.well-known/openid-configuration` and looking for the UUID in the `issuer` field.
:::
:::tip
Don't set `AZURE_AUTHORITY_HOST` in your environment unless you need to override the default value. This way, if the default value changes in the future, you won't need to update your environment.
:::
:::tip
By default, Azure AD applications use the audience `api://AzureADTokenExchange`. We recommend setting the audience to something more specific to your application.
:::
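For example, a hypothetical custom audience could then be referenced in your model config like this (the audience value is a placeholder):
```yaml
azure_ad_token: "oidc/google/api://litellm-proxy-example" # placeholder audience
```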
#### Azure AD Application Configuration
Unfortunately, Azure is a bit more complicated to set up than other OIDC relying parties like AWS. Basically, you have to:
1. Create an Azure application.
2. Add a federated credential for the OIDC IdP you're using (e.g. Google Cloud Run).
3. Add the Azure application to the resource group that contains the Azure OpenAI resource(s).
4. Give the Azure application the necessary role to access the Azure OpenAI resource(s).
The custom role below grants the recommended minimum permissions for the Azure application to access Azure OpenAI resources. You should adjust the permissions to match your specific use case.
```json
{
"id": "/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/providers/Microsoft.Authorization/roleDefinitions/baf42808-99ff-466d-b9da-f95bb0422c5f",
"properties": {
"roleName": "invoke-only",
"description": "",
"assignableScopes": [
"/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/resourceGroups/your-openai-group-name"
],
"permissions": [
{
"actions": [],
"notActions": [],
"dataActions": [
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/audio/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/search/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/extensions/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/embeddings/action",
"Microsoft.CognitiveServices/accounts/OpenAI/images/generations/action"
],
"notDataActions": []
}
]
}
}
```
_Note: Your UUIDs will be different._
Please contact us for paid enterprise support if you need help setting up Azure AD applications.

View file

@ -56,7 +56,7 @@ for chunk in response:
print(chunk["choices"][0]["delta"]["content"]) # same as openai format
```
## OpenAI Proxy Usage
## Usage with LiteLLM Proxy
Here's how to call Anthropic with the LiteLLM Proxy Server
@ -69,14 +69,6 @@ export ANTHROPIC_API_KEY="your-api-key"
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
@ -91,6 +83,14 @@ model_list:
litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
### 3. Test it

View file

@ -0,0 +1,60 @@
# FriendliAI
https://suite.friendli.ai/
**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['FRIENDLI_TOKEN']
os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint.
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models
### Serverless Endpoints
We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` |
| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` |
| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` |
### Dedicated Endpoints
```
model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE"
```
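For example (the endpoint ID and adapter route below are placeholders for your own dedicated deployment):
```python
from litellm import completion
import os

os.environ["FRIENDLI_TOKEN"] = ""
os.environ["FRIENDLI_API_BASE"] = ""  # base URL of your dedicated endpoint

response = completion(
    model="friendliai/YOUR_ENDPOINT_ID:YOUR_ADAPTER_ROUTE",  # placeholders
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response)
```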

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Groq
https://groq.com/
@ -20,7 +23,7 @@ import os
os.environ['GROQ_API_KEY'] = ""
response = completion(
model="groq/llama2-70b-4096",
model="groq/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
@ -35,7 +38,7 @@ import os
os.environ['GROQ_API_KEY'] = ""
response = completion(
model="groq/llama2-70b-4096",
model="groq/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
@ -47,6 +50,101 @@ for chunk in response:
```
## Usage with LiteLLM Proxy
### 1. Set Groq Models on config.yaml
```yaml
model_list:
- model_name: groq-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: groq/llama3-8b-8192
api_key: "os.environ/GROQ_API_KEY" # ensure you have `GROQ_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make a request to the LiteLLM proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "groq-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="groq-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "groq-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Groq Models Supported!
We support ALL Groq models, just set `groq/` as a prefix when sending completion requests
@ -114,7 +212,7 @@ tools = [
}
]
response = litellm.completion(
model="groq/llama2-70b-4096",
model="groq/llama3-8b-8192",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
@ -154,7 +252,7 @@ if tool_calls:
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="groq/llama2-70b-4096", messages=messages
model="groq/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -749,6 +749,85 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Llama 3 API
| Model Name | Function Call |
|------------------|--------------------------------------|
| meta/llama3-405b-instruct-maas | `completion('vertex_ai/meta/llama3-405b-instruct-maas', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "meta/llama3-405b-instruct-maas"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
temperature=0.7,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!** Send a request using the `model_name` from your config (here, `anthropic-llama`):
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "anthropic-llama", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
</TabItem>
</Tabs>
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -119,8 +119,8 @@ All Possible Alert Types
```python
AlertType = Literal[
"llm_exceptions",
"llm_too_slow",
"llm_exceptions", # LLM API Exceptions
"llm_too_slow", # LLM Responses slower than alerting_threshold
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
@ -133,6 +133,61 @@ AlertType = Literal[
```
## Advanced - set specific slack channels per alert type
Use this if you want to set specific channels per alert type
**This allows you to do the following**
```
llm_exceptions -> go to slack channel #llm-exceptions
spend_reports -> go to slack channel #llm-spend-reports
```
Set `alert_to_webhook_url` on your config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
alert_to_webhook_url: {
"llm_exceptions": "https://hooks.slack.com/services/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",
"llm_too_slow": "https://hooks.slack.com/services/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",
"llm_requests_hanging": "https://hooks.slack.com/services/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",
"budget_alerts": "https://hooks.slack.com/services/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",
"db_exceptions": "https://hooks.slack.com/services/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",
"daily_reports": "https://hooks.slack.com/services/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",
"spend_reports": "https://hooks.slack.com/services/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",
"cooldown_deployment": "https://hooks.slack.com/services/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",
"new_model_added": "https://hooks.slack.com/services/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",
"outage_alerts": "https://hooks.slack.com/services/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",
}
litellm_settings:
success_callback: ["langfuse"]
```
Test it - send a valid LLM request - expect to see an `llm_too_slow` alert in its own Slack channel
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
## Advanced - Using MS Teams Webhooks

View file

@ -266,6 +266,54 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
## Disable team from turning on/off guardrails
### 1. Disable team from modifying guardrails
```bash
curl -X POST 'http://0.0.0.0:4000/team/update' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"team_id": "4198d93c-d375-4c83-8d5a-71e7c5473e50",
"metadata": {"guardrails": {"modify_guardrails": false}}
}'
```
### 2. Try to disable guardrails for a call
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Think of 10 random colors."
}
],
"metadata": {"guardrails": {"hide_secrets": false}}
}'
```
### 3. Get 403 Error
```
{
"error": {
"message": {
"error": "Your team does not have permission to modify guardrails."
},
"type": "auth_error",
"param": "None",
"code": 403
}
}
```
Expect to NOT see `+1 412-612-9992` in your server logs on your callback.
:::info

View file

@ -48,6 +48,20 @@ A number of these headers could be useful for troubleshooting, but the
`x-litellm-call-id` is the one that is most useful for tracking a request across
components in your system, including in logging tools.
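For example, a minimal sketch that reads the header through the OpenAI SDK's raw-response interface (assumes a local proxy on port 4000):
```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
print(raw.headers.get("x-litellm-call-id"))   # forward this id to your logging/tracing tools
print(raw.parse().choices[0].message.content)
```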
## Redacting UserAPIKeyInfo
Redact information about the user API key (hashed token, user_id, team id, etc.) from logs.
Currently supported for Langfuse, OpenTelemetry, Logfire, and ArizeAI logging.
```yaml
litellm_settings:
callbacks: ["langfuse"]
redact_user_api_key_info: true
```
Removes any field with `user_api_key_*` from metadata.
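Illustration with hypothetical metadata fields:
```python
# hypothetical metadata attached to a request, before redaction
metadata = {
    "user_api_key_hash": "88dc28...",
    "user_api_key_user_id": "u-123",
    "requester_ip_address": "10.0.0.4",
}

# after redaction, only the `user_api_key_*` fields are removed:
# {"requester_ip_address": "10.0.0.4"}
```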
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]`; this will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment.
@ -202,6 +216,9 @@ print(response)
### Team based Logging to Langfuse
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging)
<!--
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
@ -228,7 +245,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
All requests made with these keys will log data to their team-specific logging. -->
### Redacting Messages, Response Content from Langfuse Logging
@ -1106,6 +1123,52 @@ environment_variables:
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "Hello, Claude gm!"
}
]
}
'
```
Expect to see your log on Langsmith
<Image img={require('../../img/langsmith_new.png')} />
## Logging LLM IO to Arize AI
1. Set `callbacks: ["arize"]` on litellm config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
2. Start Proxy
```

View file

@ -70,3 +70,42 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
"user": "usha"
}'
```
## Team Based Logging
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md)
<!--
## Logging / Caching
Turn on/off logging and caching for a specific team id.
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging. -->

View file

@ -0,0 +1,144 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 👥📊 Team Based Logging
Allow each team to use their own Langfuse Project / custom callbacks
**This allows you to do the following**
```
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
## Set Callbacks Per Team
### 1. Set callback for team
We make a request to `POST /team/{team_id}/callback` to add a callback for the team.
```shell
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {
"langfuse_public_key": "pk",
"langfuse_secret_key": "sk_",
"langfuse_host": "https://cloud.langfuse.com"
}
}'
```
#### Supported Values
| Field | Supported Values | Notes |
|-------|------------------|-------|
| `callback_name` | `"langfuse"` | Currently only supports "langfuse" |
| `callback_type` | `"success"`, `"failure"`, `"success_and_failure"` | |
| `callback_vars` | | dict of callback settings |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_public_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_secret_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_host` | string | Optional (defaults to https://cloud.langfuse.com) |
### 2. Create key for team
All keys created for team `dbe2f686-a686-4896-864a-4c3924458709` will log to langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"team_id": "dbe2f686-a686-4896-864a-4c3924458709"
}'
```
### 3. Make `/chat/completion` request for team
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
Expect this to be logged on the langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
## Disable Logging for a Team
To disable logging for a specific team, you can use the following endpoint:
`POST /team/{team_id}/disable_logging`
This endpoint removes all success and failure callbacks for the specified team, effectively disabling logging.
### Step 1. Disable logging for team
```shell
curl -X POST 'http://localhost:4000/team/YOUR_TEAM_ID/disable_logging' \
-H 'Authorization: Bearer YOUR_API_KEY'
```
Replace `YOUR_TEAM_ID` with the actual team ID.
**Response**
A successful request will return a response similar to this:
```json
{
"status": "success",
"message": "Logging disabled for team YOUR_TEAM_ID",
"data": {
"team_id": "YOUR_TEAM_ID",
"success_callbacks": [],
"failure_callbacks": []
}
}
```
### Step 2. Test it - `/chat/completions`
Use a key generated for team = `team_id` - you should see no logs on your configured success callback (e.g. Langfuse)
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
### Debugging / Troubleshooting
- Check active callbacks for team using `GET /team/{team_id}/callback`
Use this to check what success/failure callbacks are active for team=`team_id`
```shell
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Authorization: Bearer sk-1234'
```
## Team Logging Endpoints
- [`POST /team/{team_id}/callback` Add a success/failure callback to a team](https://litellm-api.up.railway.app/#/team%20management/add_team_callbacks_team__team_id__callback_post)
- [`GET /team/{team_id}/callback` - Get the success/failure callbacks and variables for a team](https://litellm-api.up.railway.app/#/team%20management/get_team_callbacks_team__team_id__callback_get)

New image file added (117 KiB; binary not shown).

View file

@ -44,19 +44,20 @@ const sidebars = {
"proxy/cost_tracking",
"proxy/self_serve",
"proxy/virtual_keys",
"proxy/tag_routing",
"proxy/users",
"proxy/team_budgets",
"proxy/customers",
"proxy/billing",
"proxy/guardrails",
"proxy/token_auth",
"proxy/alerting",
{
type: "category",
label: "🪢 Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/team_logging",
"proxy/guardrails",
"proxy/tag_routing",
"proxy/users",
"proxy/team_budgets",
"proxy/customers",
"proxy/billing",
"proxy/token_auth",
"proxy/alerting",
"proxy/ui",
"proxy/prometheus",
"proxy/pass_through",
@ -157,6 +158,7 @@ const sidebars = {
"providers/triton-inference-server",
"providers/ollama",
"providers/perplexity",
"providers/friendliai",
"providers/groq",
"providers/deepseek",
"providers/fireworks_ai",
@ -183,7 +185,14 @@ const sidebars = {
"scheduler",
"set_keys",
"budget_manager",
"secret",
{
type: "category",
label: "Secret Manager",
items: [
"secret",
"oidc"
]
},
"completion/token_usage",
"load_test",
{
@ -192,17 +201,19 @@ const sidebars = {
items: [
"observability/langfuse_integration",
"observability/logfire_integration",
"observability/langsmith_integration",
"observability/arize_integration",
"debugging/local_debugging",
"observability/raw_request_response",
"observability/custom_callback",
"observability/scrub_data",
"observability/helicone_integration",
"observability/braintrust",
"observability/sentry",
"observability/lago",
"observability/helicone_integration",
"observability/openmeter",
"observability/promptlayer_integration",
"observability/wandb_integration",
"observability/langsmith_integration",
"observability/slack_integration",
"observability/traceloop_integration",
"observability/athina_integration",

View file

@ -4,7 +4,7 @@ import warnings
warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
### INIT VARIABLES ###
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching import Cache
from litellm._logging import (
@ -38,8 +38,18 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal[
"lago", "openmeter", "logfire", "dynamic_rate_limiter", "langsmith", "galileo"
"lago",
"openmeter",
"logfire",
"dynamic_rate_limiter",
"langsmith",
"galileo",
"braintrust",
"arize",
]
_known_custom_logger_compatible_callbacks: List = list(
get_args(_custom_logger_compatible_callbacks_literal)
)
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
List[
@ -67,6 +77,7 @@ post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False
store_audit_logs = False # Enterprise feature, allow users to see audit logs
## end of callbacks #############
@ -346,6 +357,7 @@ vertex_text_models: List = []
vertex_code_text_models: List = []
vertex_embedding_models: List = []
vertex_anthropic_models: List = []
vertex_llama3_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -388,6 +400,9 @@ for key, value in model_cost.items():
elif value.get("litellm_provider") == "vertex_ai-anthropic_models":
key = key.replace("vertex_ai/", "")
vertex_anthropic_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-llama_models":
key = key.replace("vertex_ai/", "")
vertex_llama3_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -817,6 +832,7 @@ from .llms.petals import PetalsConfig
from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.vertex_ai_llama import VertexAILlama3Config
from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
@ -872,6 +888,7 @@ from .exceptions import (
APIError,
Timeout,
APIConnectionError,
UnsupportedParamsError,
APIResponseValidationError,
UnprocessableEntityError,
InternalServerError,

View file

@ -682,11 +682,39 @@ class JSONSchemaValidationError(APIError):
)
class UnsupportedParamsError(BadRequestError):
def __init__(
self,
message,
llm_provider: Optional[str] = None,
model: Optional[str] = None,
status_code: int = 400,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
):
self.status_code = 400
self.message = "litellm.UnsupportedParamsError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
self.max_retries = max_retries
self.num_retries = num_retries
LITELLM_EXCEPTION_TYPES = [
AuthenticationError,
NotFoundError,
BadRequestError,
UnprocessableEntityError,
UnsupportedParamsError,
Timeout,
PermissionDeniedError,
RateLimitError,

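A minimal sketch of the newly exported exception's shape (constructor values are illustrative):
```python
import litellm

err = litellm.UnsupportedParamsError(
    message="`response_format` is not supported by this provider",  # illustrative message
    llm_provider="bedrock",
    model="anthropic.claude-3-haiku-20240307-v1:0",
)
print(err.status_code)  # 400
print(err.message)      # "litellm.UnsupportedParamsError: ..."
```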
View file

@ -0,0 +1,286 @@
from enum import Enum
class SpanAttributes:
OUTPUT_VALUE = "output.value"
OUTPUT_MIME_TYPE = "output.mime_type"
"""
The type of output.value. If unspecified, the type is plain text by default.
If type is JSON, the value is a string representing a JSON object.
"""
INPUT_VALUE = "input.value"
INPUT_MIME_TYPE = "input.mime_type"
"""
The type of input.value. If unspecified, the type is plain text by default.
If type is JSON, the value is a string representing a JSON object.
"""
EMBEDDING_EMBEDDINGS = "embedding.embeddings"
"""
A list of objects containing embedding data, including the vector and represented piece of text.
"""
EMBEDDING_MODEL_NAME = "embedding.model_name"
"""
The name of the embedding model.
"""
LLM_FUNCTION_CALL = "llm.function_call"
"""
For models and APIs that support function calling. Records attributes such as the function
name and arguments to the called function.
"""
LLM_INVOCATION_PARAMETERS = "llm.invocation_parameters"
"""
Invocation parameters passed to the LLM or API, such as the model name, temperature, etc.
"""
LLM_INPUT_MESSAGES = "llm.input_messages"
"""
Messages provided to a chat API.
"""
LLM_OUTPUT_MESSAGES = "llm.output_messages"
"""
Messages received from a chat API.
"""
LLM_MODEL_NAME = "llm.model_name"
"""
The name of the model being used.
"""
LLM_PROMPTS = "llm.prompts"
"""
Prompts provided to a completions API.
"""
LLM_PROMPT_TEMPLATE = "llm.prompt_template.template"
"""
The prompt template as a Python f-string.
"""
LLM_PROMPT_TEMPLATE_VARIABLES = "llm.prompt_template.variables"
"""
A list of input variables to the prompt template.
"""
LLM_PROMPT_TEMPLATE_VERSION = "llm.prompt_template.version"
"""
The version of the prompt template being used.
"""
LLM_TOKEN_COUNT_PROMPT = "llm.token_count.prompt"
"""
Number of tokens in the prompt.
"""
LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
"""
Number of tokens in the completion.
"""
LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
"""
Total number of tokens, including both prompt and completion.
"""
TOOL_NAME = "tool.name"
"""
Name of the tool being used.
"""
TOOL_DESCRIPTION = "tool.description"
"""
Description of the tool's purpose, typically used to select the tool.
"""
TOOL_PARAMETERS = "tool.parameters"
"""
Parameters of the tool represented as a dictionary JSON string, e.g.
see https://platform.openai.com/docs/guides/gpt/function-calling
"""
RETRIEVAL_DOCUMENTS = "retrieval.documents"
METADATA = "metadata"
"""
Metadata attributes are used to store user-defined key-value pairs.
For example, LangChain uses metadata to store user-defined attributes for a chain.
"""
TAG_TAGS = "tag.tags"
"""
Custom categorical tags for the span.
"""
OPENINFERENCE_SPAN_KIND = "openinference.span.kind"
SESSION_ID = "session.id"
"""
The id of the session
"""
USER_ID = "user.id"
"""
The id of the user
"""
class MessageAttributes:
"""
Attributes for a message sent to or from an LLM
"""
MESSAGE_ROLE = "message.role"
"""
The role of the message, such as "user", "agent", "function".
"""
MESSAGE_CONTENT = "message.content"
"""
The content of the message to or from the llm, must be a string.
"""
MESSAGE_CONTENTS = "message.contents"
"""
The message contents to the llm, it is an array of
`message_content` prefixed attributes.
"""
MESSAGE_NAME = "message.name"
"""
The name of the message, often used to identify the function
that was used to generate the message.
"""
MESSAGE_TOOL_CALLS = "message.tool_calls"
"""
The tool calls generated by the model, such as function calls.
"""
MESSAGE_FUNCTION_CALL_NAME = "message.function_call_name"
"""
The function name that is a part of the message list.
This is populated for role 'function' or 'agent' as a mechanism to identify
the function that was called during the execution of a tool.
"""
MESSAGE_FUNCTION_CALL_ARGUMENTS_JSON = "message.function_call_arguments_json"
"""
The JSON string representing the arguments passed to the function
during a function call.
"""
class MessageContentAttributes:
"""
Attributes for the contents of user messages sent to an LLM.
"""
MESSAGE_CONTENT_TYPE = "message_content.type"
"""
The type of the content, such as "text" or "image".
"""
MESSAGE_CONTENT_TEXT = "message_content.text"
"""
The text content of the message, if the type is "text".
"""
MESSAGE_CONTENT_IMAGE = "message_content.image"
"""
The image content of the message, if the type is "image".
An image can be made available to the model by passing a link to
the image or by passing the base64 encoded image directly in the
request.
"""
class ImageAttributes:
"""
Attributes for images
"""
IMAGE_URL = "image.url"
"""
An http or base64 image url
"""
class DocumentAttributes:
"""
Attributes for a document.
"""
DOCUMENT_ID = "document.id"
"""
The id of the document.
"""
DOCUMENT_SCORE = "document.score"
"""
The score of the document
"""
DOCUMENT_CONTENT = "document.content"
"""
The content of the document.
"""
DOCUMENT_METADATA = "document.metadata"
"""
The metadata of the document represented as a dictionary
JSON string, e.g. `"{ 'title': 'foo' }"`
"""
class RerankerAttributes:
"""
Attributes for a reranker
"""
RERANKER_INPUT_DOCUMENTS = "reranker.input_documents"
"""
List of documents as input to the reranker
"""
RERANKER_OUTPUT_DOCUMENTS = "reranker.output_documents"
"""
List of documents as output from the reranker
"""
RERANKER_QUERY = "reranker.query"
"""
Query string for the reranker
"""
RERANKER_MODEL_NAME = "reranker.model_name"
"""
Model name of the reranker
"""
RERANKER_TOP_K = "reranker.top_k"
"""
Top K parameter of the reranker
"""
class EmbeddingAttributes:
"""
Attributes for an embedding
"""
EMBEDDING_TEXT = "embedding.text"
"""
The text represented by the embedding.
"""
EMBEDDING_VECTOR = "embedding.vector"
"""
The embedding vector.
"""
class ToolCallAttributes:
"""
Attributes for a tool call
"""
TOOL_CALL_FUNCTION_NAME = "tool_call.function.name"
"""
The name of function that is being called during a tool call.
"""
TOOL_CALL_FUNCTION_ARGUMENTS_JSON = "tool_call.function.arguments"
"""
The JSON string representing the arguments passed to the function
during a tool call.
"""
class OpenInferenceSpanKindValues(Enum):
TOOL = "TOOL"
CHAIN = "CHAIN"
LLM = "LLM"
RETRIEVER = "RETRIEVER"
EMBEDDING = "EMBEDDING"
AGENT = "AGENT"
RERANKER = "RERANKER"
UNKNOWN = "UNKNOWN"
GUARDRAIL = "GUARDRAIL"
EVALUATOR = "EVALUATOR"
class OpenInferenceMimeTypeValues(Enum):
TEXT = "text/plain"
JSON = "application/json"

View file

@ -0,0 +1,114 @@
"""
Arize AI is OTEL compatible.
This file has Arize AI specific helper functions.
"""
from typing import TYPE_CHECKING, Any, Optional, Union
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
else:
Span = Any
def set_arize_ai_attributes(span: Span, kwargs, response_obj):
from litellm.integrations._types.open_inference import (
MessageAttributes,
MessageContentAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
)
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
#############################################
############ LLM CALL METADATA ##############
#############################################
# commented out for now - looks like Arize AI could not log this
# metadata = litellm_params.get("metadata", {}) or {}
# span.set_attribute(SpanAttributes.METADATA, str(metadata))
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span.set_attribute(
SpanAttributes.OPENINFERENCE_SPAN_KIND, OpenInferenceSpanKindValues.LLM.value
)
messages = kwargs.get("messages")
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
)
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
# Invocation parameters sent to the LLM (temperature, max_tokens, etc.)
span.set_attribute(SpanAttributes.LLM_INVOCATION_PARAMETERS, str(optional_params))
if optional_params.get("user"):
span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
for choice in response_obj.get("choices"):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message["role"],
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
usage = response_obj.get("usage")
if usage:
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass

View file

@ -0,0 +1,369 @@
# What is this?
## Log success + failure events to Braintrust
import copy
import json
import os
import threading
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
import litellm
from litellm import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import get_formatted_prompt
global_braintrust_http_handler = AsyncHTTPHandler()
global_braintrust_sync_http_handler = HTTPHandler()
API_BASE = "https://api.braintrustdata.com/v1"
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
class BraintrustLogger(CustomLogger):
def __init__(
self, api_key: Optional[str] = None, api_base: Optional[str] = None
) -> None:
super().__init__()
self.validate_environment(api_key=api_key)
self.api_base = api_base or API_BASE
self.default_project_id = None
self.api_key: str = api_key or os.getenv("BRAINTRUST_API_KEY") # type: ignore
self.headers = {
"Authorization": "Bearer " + self.api_key,
"Content-Type": "application/json",
}
def validate_environment(self, api_key: Optional[str]):
"""
Expects
BRAINTRUST_API_KEY
in the environment
"""
missing_keys = []
if api_key is None and os.getenv("BRAINTRUST_API_KEY", None) is None:
missing_keys.append("BRAINTRUST_API_KEY")
if len(missing_keys) > 0:
raise Exception("Missing keys={} in environment.".format(missing_keys))
@staticmethod
def add_metadata_from_header(litellm_params: dict, metadata: dict) -> dict:
"""
Adds metadata from proxy request headers to Braintrust logging if keys start with "braintrust"
and overwrites litellm_params.metadata if already included.
For example, to pass extra metadata via header, send
`headers: { ..., braintrust_my_field: my-value }` via proxy request.
"""
if litellm_params is None:
return metadata
if litellm_params.get("proxy_server_request") is None:
return metadata
if metadata is None:
metadata = {}
proxy_headers = (
litellm_params.get("proxy_server_request", {}).get("headers", {}) or {}
)
for metadata_param_key in proxy_headers:
if metadata_param_key.startswith("braintrust"):
trace_param_key = metadata_param_key.replace("braintrust", "", 1)
if trace_param_key in metadata:
verbose_logger.warning(
f"Overwriting Braintrust `{trace_param_key}` from request header"
)
else:
verbose_logger.debug(
f"Found Braintrust `{trace_param_key}` in request header"
)
metadata[trace_param_key] = proxy_headers.get(metadata_param_key)
return metadata
async def create_default_project_and_experiment(self):
project = await global_braintrust_http_handler.post(
f"{self.api_base}/project", headers=self.headers, json={"name": "litellm"}
)
project_dict = project.json()
self.default_project_id = project_dict["id"]
def create_sync_default_project_and_experiment(self):
project = global_braintrust_sync_http_handler.post(
f"{self.api_base}/project", headers=self.headers, json={"name": "litellm"}
)
project_dict = project.json()
self.default_project_id = project_dict["id"]
def log_success_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
try:
litellm_call_id = kwargs.get("litellm_call_id")
project_id = kwargs.get("project_id", None)
if project_id is None:
if self.default_project_id is None:
self.create_sync_default_project_and_experiment()
project_id = self.default_project_id
prompt = {"messages": kwargs.get("messages")}
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
input = prompt
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
input = prompt
output = response_obj.choices[0].text
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
input = prompt
output = response_obj["data"]
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
metadata = new_metadata
tags = []
if isinstance(metadata, dict):
for key, value in metadata.items():
# generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy
if (
litellm._langfuse_default_tags is not None
and isinstance(litellm._langfuse_default_tags, list)
and key in litellm._langfuse_default_tags
):
tags.append(f"{key}:{value}")
# clean litellm metadata before logging
if key in [
"headers",
"endpoint",
"caching_groups",
"previous_models",
]:
continue
else:
clean_metadata[key] = value
cost = kwargs.get("response_cost", None)
if cost is not None:
clean_metadata["litellm_response_cost"] = cost
metrics: Optional[dict] = None
if (
response_obj is not None
and hasattr(response_obj, "usage")
and isinstance(response_obj.usage, litellm.Usage)
):
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
metrics = {
"prompt_tokens": response_obj.usage.prompt_tokens,
"completion_tokens": response_obj.usage.completion_tokens,
"total_tokens": response_obj.usage.total_tokens,
"total_cost": cost,
}
request_data = {
"id": litellm_call_id,
"input": prompt,
"output": output,
"metadata": clean_metadata,
"tags": tags,
}
if metrics is not None:
request_data["metrics"] = metrics
try:
global_braintrust_sync_http_handler.post(
url=f"{self.api_base}/project_logs/{project_id}/insert",
json={"events": [request_data]},
headers=self.headers,
)
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.error(
"Error logging to braintrust - Exception received - {}\n{}".format(
str(e), traceback.format_exc()
)
)
raise e
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
try:
litellm_call_id = kwargs.get("litellm_call_id")
project_id = kwargs.get("project_id", None)
if project_id is None:
if self.default_project_id is None:
await self.create_default_project_and_experiment()
project_id = self.default_project_id
prompt = {"messages": kwargs.get("messages")}
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
input = prompt
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
input = prompt
output = response_obj.choices[0].text
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
input = prompt
output = response_obj["data"]
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
metadata = new_metadata
tags = []
if isinstance(metadata, dict):
for key, value in metadata.items():
# generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy
if (
litellm._langfuse_default_tags is not None
and isinstance(litellm._langfuse_default_tags, list)
and key in litellm._langfuse_default_tags
):
tags.append(f"{key}:{value}")
# clean litellm metadata before logging
if key in [
"headers",
"endpoint",
"caching_groups",
"previous_models",
]:
continue
else:
clean_metadata[key] = value
cost = kwargs.get("response_cost", None)
if cost is not None:
clean_metadata["litellm_response_cost"] = cost
metrics: Optional[dict] = None
if (
response_obj is not None
and hasattr(response_obj, "usage")
and isinstance(response_obj.usage, litellm.Usage)
):
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
metrics = {
"prompt_tokens": response_obj.usage.prompt_tokens,
"completion_tokens": response_obj.usage.completion_tokens,
"total_tokens": response_obj.usage.total_tokens,
"total_cost": cost,
}
request_data = {
"id": litellm_call_id,
"input": prompt,
"output": output,
"metadata": clean_metadata,
"tags": tags,
}
if metrics is not None:
request_data["metrics"] = metrics
try:
await global_braintrust_http_handler.post(
url=f"{self.api_base}/project_logs/{project_id}/insert",
json={"events": [request_data]},
headers=self.headers,
)
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.error(
"Error logging to braintrust - Exception received - {}\n{}".format(
str(e), traceback.format_exc()
)
)
raise e
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
return super().log_failure_event(kwargs, response_obj, start_time, end_time)
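For context, a minimal sketch of enabling the logger above from the SDK. The callback name matches the `"braintrust"` integration registered later in this diff; the `BRAINTRUST_API_KEY` environment variable is an assumption about where the logger reads its key from:

```python
import os
import litellm

os.environ["BRAINTRUST_API_KEY"] = ""  # assumption: the logger picks up its key from this env var
os.environ["OPENAI_API_KEY"] = ""      # key for the underlying LLM call

# register the logger shown above; success events are POSTed to /project_logs/{project_id}/insert
litellm.callbacks = ["braintrust"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
)
```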

View file

@ -8,6 +8,7 @@ from packaging.version import Version
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
class LangFuseLogger:
@ -382,6 +383,8 @@ class LangFuseLogger:
mask_input = clean_metadata.pop("mask_input", False)
mask_output = clean_metadata.pop("mask_output", False)
clean_metadata = redact_user_api_key_info(metadata=clean_metadata)
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.

View file

@ -79,6 +79,7 @@ class LangsmithLogger(CustomLogger):
project_name = metadata.get("project_name", self.langsmith_project)
run_name = metadata.get("run_name", self.langsmith_default_run_name)
run_id = metadata.get("id", None)
tags = metadata.get("tags", []) or []
verbose_logger.debug(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
)
@ -122,6 +123,7 @@ class LangsmithLogger(CustomLogger):
"session_name": project_name,
"start_time": start_time,
"end_time": end_time,
"tags": tags,
}
if run_id:
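A short sketch of how the new `tags` field can be exercised from the SDK - per-request metadata is what `metadata.get("tags", [])` above reads from (model name and tag values are placeholders, and LangSmith credentials are assumed to be configured in the environment):

```python
import litellm

litellm.callbacks = ["langsmith"]

# "tags" in request metadata is now forwarded onto the LangSmith run
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
    metadata={"run_name": "demo-run", "tags": ["prod", "eu-region"]},
)
```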

View file

@ -1,17 +1,21 @@
#### What this does ####
# On success + failure, log events to Logfire
import dotenv, os
import os
import dotenv
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import uuid
from litellm._logging import print_verbose, verbose_logger
from enum import Enum
from typing import Any, Dict, NamedTuple
from typing_extensions import LiteralString
from litellm._logging import print_verbose, verbose_logger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
class SpanConfig(NamedTuple):
message_template: LiteralString
@ -135,6 +139,8 @@ class LogfireLogger:
else:
clean_metadata[key] = value
clean_metadata = redact_user_api_key_info(metadata=clean_metadata)
# Build the initial payload
payload = {
"id": id,

View file

@ -2,11 +2,12 @@ import os
from dataclasses import dataclass
from datetime import datetime
from functools import wraps
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
from litellm.types.services import ServiceLoggerPayload
if TYPE_CHECKING:
@ -27,9 +28,10 @@ else:
LITELLM_TRACER_NAME = os.getenv("OTEL_TRACER_NAME", "litellm")
LITELLM_RESOURCE = {
LITELLM_RESOURCE: Dict[Any, Any] = {
"service.name": os.getenv("OTEL_SERVICE_NAME", "litellm"),
"deployment.environment": os.getenv("OTEL_ENVIRONMENT_NAME", "production"),
"model_id": os.getenv("OTEL_SERVICE_NAME", "litellm"),
}
RAW_REQUEST_SPAN_NAME = "raw_gen_ai_request"
LITELLM_REQUEST_SPAN_NAME = "litellm_request"
@ -68,7 +70,9 @@ class OpenTelemetryConfig:
class OpenTelemetry(CustomLogger):
def __init__(self, config=OpenTelemetryConfig.from_env()):
def __init__(
self, config=OpenTelemetryConfig.from_env(), callback_name: Optional[str] = None
):
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
@ -79,6 +83,7 @@ class OpenTelemetry(CustomLogger):
self.OTEL_HEADERS = self.config.headers
provider = TracerProvider(resource=Resource(attributes=LITELLM_RESOURCE))
provider.add_span_processor(self._get_span_processor())
self.callback_name = callback_name
trace.set_tracer_provider(provider)
self.tracer = trace.get_tracer(LITELLM_TRACER_NAME)
@ -120,8 +125,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
_end_time_ns = end_time
_start_time_ns = 0
_end_time_ns = 0
if isinstance(start_time, float):
_start_time_ns = int(int(start_time) * 1e9)
@ -159,8 +164,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
_end_time_ns = end_time
_start_time_ns = 0
_end_time_ns = 0
if isinstance(start_time, float):
_start_time_ns = int(int(start_time) * 1e9)
@ -294,6 +299,11 @@ class OpenTelemetry(CustomLogger):
return isinstance(value, (str, bool, int, float))
def set_attributes(self, span: Span, kwargs, response_obj):
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
set_arize_ai_attributes(span, kwargs, response_obj)
return
from litellm.proxy._types import SpanAttributes
optional_params = kwargs.get("optional_params", {})
@ -306,7 +316,9 @@ class OpenTelemetry(CustomLogger):
#############################################
metadata = litellm_params.get("metadata", {}) or {}
for key, value in metadata.items():
clean_metadata = redact_user_api_key_info(metadata=metadata)
for key, value in clean_metadata.items():
if self.is_primitive(value):
span.set_attribute("metadata.{}".format(key), value)
@ -612,8 +624,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
_end_time_ns = logging_payload.end_time
_start_time_ns = 0
_end_time_ns = 0
start_time = logging_payload.start_time
end_time = logging_payload.end_time
@ -658,8 +670,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
_end_time_ns = logging_payload.end_time
_start_time_ns = 0
_end_time_ns = 0
start_time = logging_payload.start_time
end_time = logging_payload.end_time

View file

@ -53,6 +53,7 @@ from litellm.utils import (
from ..integrations.aispend import AISpendLogger
from ..integrations.athina import AthinaLogger
from ..integrations.berrispend import BerriSpendLogger
from ..integrations.braintrust_logging import BraintrustLogger
from ..integrations.clickhouse import ClickhouseLogger
from ..integrations.custom_logger import CustomLogger
from ..integrations.datadog import DataDogLogger
@ -1945,7 +1946,14 @@ def _init_custom_logger_compatible_class(
_openmeter_logger = OpenMeterLogger()
_in_memory_loggers.append(_openmeter_logger)
return _openmeter_logger # type: ignore
elif logging_integration == "braintrust":
for callback in _in_memory_loggers:
if isinstance(callback, BraintrustLogger):
return callback # type: ignore
braintrust_logger = BraintrustLogger()
_in_memory_loggers.append(braintrust_logger)
return braintrust_logger # type: ignore
elif logging_integration == "langsmith":
for callback in _in_memory_loggers:
if isinstance(callback, LangsmithLogger):
@ -1954,6 +1962,43 @@ def _init_custom_logger_compatible_class(
_langsmith_logger = LangsmithLogger()
_in_memory_loggers.append(_langsmith_logger)
return _langsmith_logger # type: ignore
elif logging_integration == "arize":
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if "ARIZE_API_KEY" not in os.environ:
raise ValueError("ARIZE_API_KEY not found in environment variables")
from litellm.integrations.opentelemetry import (
OpenTelemetry,
OpenTelemetryConfig,
)
otel_config = OpenTelemetryConfig(
exporter="otlp_grpc",
endpoint="https://otlp.arize.com/v1",
)
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={os.getenv('ARIZE_SPACE_KEY')},api_key={os.getenv('ARIZE_API_KEY')}"
)
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
and callback.callback_name == "arize"
):
return callback # type: ignore
_otel_logger = OpenTelemetry(config=otel_config, callback_name="arize")
_in_memory_loggers.append(_otel_logger)
return _otel_logger # type: ignore
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
for callback in _in_memory_loggers:
if isinstance(callback, OpenTelemetry):
return callback # type: ignore
otel_logger = OpenTelemetry()
_in_memory_loggers.append(otel_logger)
return otel_logger # type: ignore
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
@ -2019,6 +2064,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, OpenMeterLogger):
return callback
elif logging_integration == "braintrust":
for callback in _in_memory_loggers:
if isinstance(callback, BraintrustLogger):
return callback
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
if isinstance(callback, GalileoObserve):
@ -2027,6 +2076,25 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, LangsmithLogger):
return callback
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
for callback in _in_memory_loggers:
if isinstance(callback, OpenTelemetry):
return callback
elif logging_integration == "arize":
from litellm.integrations.opentelemetry import OpenTelemetry
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if "ARIZE_API_KEY" not in os.environ:
raise ValueError("ARIZE_API_KEY not found in environment variables")
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
and callback.callback_name == "arize"
):
return callback
elif logging_integration == "logfire":
if "LOGFIRE_TOKEN" not in os.environ:
raise ValueError("LOGFIRE_TOKEN not found in environment variables")

View file

@ -87,3 +87,33 @@ def redact_message_input_output_from_logging(
# by default return result
return result
def redact_user_api_key_info(metadata: dict) -> dict:
"""
Removes any `user_api_key_*` info from metadata before it is passed to the logging object, if the flag is set.
Usage:
SDK
```python
litellm.redact_user_api_key_info = True
```
PROXY:
```yaml
litellm_settings:
redact_user_api_key_info: true
```
"""
if litellm.redact_user_api_key_info is not True:
return metadata
new_metadata = {}
for k, v in metadata.items():
if isinstance(k, str) and k.startswith("user_api_key"):
pass
else:
new_metadata[k] = v
return new_metadata
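A small sketch of the behaviour: with the flag enabled, any metadata key prefixed with `user_api_key` is dropped before the logger sees it (sample values are illustrative):

```python
import litellm
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info

litellm.redact_user_api_key_info = True

metadata = {
    "user_api_key_hash": "88dc28d0...",   # dropped
    "user_api_key_alias": "team-key",     # dropped
    "requester_ip_address": "10.0.0.1",   # kept
}
print(redact_user_api_key_info(metadata=metadata))
# -> {"requester_ip_address": "10.0.0.1"}
```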

View file

@ -385,6 +385,11 @@ class AnthropicConfig:
if "user_id" in anthropic_message_request["metadata"]:
new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
# Pass litellm proxy specific metadata
if "litellm_metadata" in anthropic_message_request:
# metadata will be passed to litellm.acompletion(), it's a litellm_param
new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata")
## CONVERT TOOL CHOICE
if "tool_choice" in anthropic_message_request:
new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
@ -775,8 +780,17 @@ class AnthropicChatCompletion(BaseLLM):
system_prompt = ""
for idx, message in enumerate(messages):
if message["role"] == "system":
system_prompt += message["content"]
system_prompt_indices.append(idx)
valid_content: bool = False
if isinstance(message["content"], str):
system_prompt += message["content"]
valid_content = True
elif isinstance(message["content"], list):
for content in message["content"]:
system_prompt += content.get("text", "")
valid_content = True
if valid_content:
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
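Sketch of what the new branch accepts - a system message whose content is a list of text blocks is now folded into Anthropic's top-level system prompt, where the previous string concatenation would have failed (model name is illustrative, and `ANTHROPIC_API_KEY` is assumed to be set):

```python
import litellm

messages = [
    # both the plain-string form and the list-of-text-blocks form are handled now
    {"role": "system", "content": [{"type": "text", "text": "You are a terse assistant."}]},
    {"role": "user", "content": "Hello"},
]
response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
```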

View file

@ -76,6 +76,8 @@ BEDROCK_CONVERSE_MODELS = [
"anthropic.claude-v1",
"anthropic.claude-instant-v1",
"ai21.jamba-instruct-v1:0",
"meta.llama3-1-8b-instruct-v1:0",
"meta.llama3-1-70b-instruct-v1:0",
]
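With the two Llama 3.1 entries added to `BEDROCK_CONVERSE_MODELS`, a call along these lines should route through the Converse API (a sketch; AWS credentials and region are assumed to be configured in the environment):

```python
import litellm

# assumes AWS credentials + region are already set in the environment
response = litellm.completion(
    model="bedrock/meta.llama3-1-8b-instruct-v1:0",
    messages=[{"role": "user", "content": "Hi"}],
)
```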
@ -1729,7 +1731,7 @@ class BedrockConverseLLM(BaseLLM):
headers={},
client: Optional[AsyncHTTPHandler] = None,
) -> Union[ModelResponse, CustomStreamWrapper]:
if client is None:
if client is None or not isinstance(client, AsyncHTTPHandler):
_params = {}
if timeout is not None:
if isinstance(timeout, float) or isinstance(timeout, int):

View file

@ -968,7 +968,7 @@ class OpenAIChatCompletion(BaseLLM):
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
if e.body is not None and e.body.get("detail"): # type: ignore
if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"): # type: ignore
detail = e.body.get("detail") # type: ignore
invalid_params: List[str] = []
if (
@ -1100,7 +1100,7 @@ class OpenAIChatCompletion(BaseLLM):
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
if e.body is not None and e.body.get("detail"): # type: ignore
if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"): # type: ignore
detail = e.body.get("detail") # type: ignore
invalid_params: List[str] = []
if (
@ -1231,7 +1231,7 @@ class OpenAIChatCompletion(BaseLLM):
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
if e.body is not None and e.body.get("detail"): # type: ignore
if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"): # type: ignore
detail = e.body.get("detail") # type: ignore
invalid_params: List[str] = []
if (

View file

@ -1,23 +1,31 @@
import copy
import json
import os
import time
import types
from enum import Enum
from typing import Callable, List, Optional
from typing import Any, Callable, Dict, List, Optional, Sequence, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import (
Choices,
CustomStreamWrapper,
Delta,
EmbeddingResponse,
Message,
ModelResponse,
Usage,
map_finish_reason,
)
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class TritonError(Exception):
def __init__(self, status_code, message):
def __init__(self, status_code: int, message: str) -> None:
self.status_code = status_code
self.message = message
self.request = httpx.Request(
@ -41,8 +49,7 @@ class TritonChatCompletion(BaseLLM):
api_base: str,
logging_obj=None,
api_key: Optional[str] = None,
):
) -> EmbeddingResponse:
async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
@ -79,10 +86,10 @@ class TritonChatCompletion(BaseLLM):
return model_response
def embedding(
async def embedding(
self,
model: str,
input: list,
input: List[str],
timeout: float,
api_base: str,
model_response: litellm.utils.EmbeddingResponse,
@ -90,8 +97,8 @@ class TritonChatCompletion(BaseLLM):
logging_obj=None,
optional_params=None,
client=None,
aembedding=None,
):
aembedding: bool = False,
) -> EmbeddingResponse:
data_for_triton = {
"inputs": [
{
@ -103,8 +110,6 @@ class TritonChatCompletion(BaseLLM):
]
}
## LOGGING
curl_string = f"curl {api_base} -X POST -H 'Content-Type: application/json' -d '{data_for_triton}'"
logging_obj.pre_call(
@ -116,8 +121,8 @@ class TritonChatCompletion(BaseLLM):
},
)
if aembedding == True:
response = self.aembedding(
if aembedding:
response = await self.aembedding(
data=data_for_triton,
model_response=model_response,
logging_obj=logging_obj,
@ -130,6 +135,198 @@ class TritonChatCompletion(BaseLLM):
"Only async embedding supported for triton, please use litellm.aembedding() for now"
)
def completion(
self,
model: str,
messages: List[dict],
timeout: float,
api_base: str,
model_response: ModelResponse,
api_key: Optional[str] = None,
logging_obj=None,
optional_params=None,
client=None,
stream: Optional[bool] = False,
acompletion: bool = False,
) -> ModelResponse:
type_of_model = ""
optional_params.pop("stream", False)
if api_base.endswith("generate"): ### This is a trtllm model
text_input = messages[0]["content"]
data_for_triton: Dict[str, Any] = {
"text_input": prompt_factory(model=model, messages=messages),
"parameters": {
"max_tokens": int(optional_params.get("max_tokens", 2000)),
"bad_words": [""],
"stop_words": [""],
},
"stream": bool(stream),
}
data_for_triton["parameters"].update(optional_params)
type_of_model = "trtllm"
elif api_base.endswith(
"infer"
): ### This is an infer model with a custom model on triton
text_input = messages[0]["content"]
data_for_triton = {
"inputs": [
{
"name": "text_input",
"shape": [1],
"datatype": "BYTES",
"data": [text_input],
}
]
}
for k, v in optional_params.items():
if not (k == "stream" or k == "max_retries"):
datatype = "INT32" if isinstance(v, int) else "BYTES"
datatype = "FP32" if isinstance(v, float) else datatype
data_for_triton["inputs"].append(
{"name": k, "shape": [1], "datatype": datatype, "data": [v]}
)
if "max_tokens" not in optional_params:
data_for_triton["inputs"].append(
{
"name": "max_tokens",
"shape": [1],
"datatype": "INT32",
"data": [20],
}
)
type_of_model = "infer"
else: ## Unknown model type passthrough
data_for_triton = {
"inputs": [
{
"name": "text_input",
"shape": [1],
"datatype": "BYTES",
"data": [messages[0]["content"]],
}
]
}
if logging_obj:
logging_obj.pre_call(
input=messages,
api_key=api_key,
additional_args={
"complete_input_dict": optional_params,
"api_base": api_base,
"http_client": client,
},
)
headers = {"Content-Type": "application/json"}
json_data_for_triton: str = json.dumps(data_for_triton)
if acompletion:
return self.acompletion( # type: ignore
model,
json_data_for_triton,
headers=headers,
logging_obj=logging_obj,
api_base=api_base,
stream=stream,
model_response=model_response,
type_of_model=type_of_model,
)
else:
handler = HTTPHandler()
if stream:
return self._handle_stream(
handler, api_base, data_for_triton, model, logging_obj
)
else:
response = handler.post(url=api_base, data=data_for_triton, headers=headers)
return self._handle_response(
response, model_response, logging_obj, type_of_model=type_of_model
)
async def acompletion(
self,
model: str,
data_for_triton,
api_base,
stream,
logging_obj,
headers,
model_response,
type_of_model,
) -> ModelResponse:
handler = AsyncHTTPHandler()
if stream:
return self._ahandle_stream(
handler, api_base, data_for_triton, model, logging_obj
)
else:
response = await handler.post(
url=api_base, data=data_for_triton, headers=headers
)
return self._handle_response(
response, model_response, logging_obj, type_of_model=type_of_model
)
def _handle_stream(self, handler, api_base, data_for_triton, model, logging_obj):
response = handler.post(
url=api_base + "_stream", data=data_for_triton, stream=True
)
streamwrapper = litellm.CustomStreamWrapper(
response.iter_lines(),
model=model,
custom_llm_provider="triton",
logging_obj=logging_obj,
)
for chunk in streamwrapper:
yield (chunk)
async def _ahandle_stream(
self, handler, api_base, data_for_triton, model, logging_obj
):
response = await handler.post(
url=api_base + "_stream", data=data_for_triton, stream=True
)
streamwrapper = litellm.CustomStreamWrapper(
response.aiter_lines(),
model=model,
custom_llm_provider="triton",
logging_obj=logging_obj,
)
async for chunk in streamwrapper:
yield (chunk)
def _handle_response(self, response, model_response, logging_obj, type_of_model):
if logging_obj:
logging_obj.post_call(original_response=response)
if response.status_code != 200:
raise TritonError(status_code=response.status_code, message=response.text)
_json_response = response.json()
model_response.model = _json_response.get("model_name", "None")
if type_of_model == "trtllm":
model_response.choices = [
Choices(index=0, message=Message(content=_json_response["text_output"]))
]
elif type_of_model == "infer":
model_response.choices = [
Choices(
index=0,
message=Message(content=_json_response["outputs"][0]["data"]),
)
]
else:
model_response.choices = [
Choices(index=0, message=Message(content=_json_response["outputs"]))
]
return model_response
@staticmethod
def split_embedding_by_shape(
data: List[float], shape: List[int]

View file

@ -0,0 +1,203 @@
# What is this?
## Handler for calling llama 3.1 API on Vertex AI
import copy
import json
import os
import time
import types
import uuid
from enum import Enum
from typing import Any, Callable, List, Optional, Tuple, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.anthropic import (
AnthropicMessagesTool,
AnthropicMessagesToolChoice,
)
from litellm.types.llms.openai import (
ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk,
)
from litellm.types.utils import ResponseFormatChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import (
construct_tool_use_system_prompt,
contains_tag,
custom_prompt,
extract_between_tags,
parse_xml_params,
prompt_factory,
response_schema_prompt,
)
class VertexAIError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
method="POST", url=" https://cloud.google.com/vertex-ai/"
)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class VertexAILlama3Config:
"""
Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama#streaming
The class `VertexAILlama3Config` provides configuration for Vertex AI's Llama API interface. Below are the parameters:
- `max_tokens` (integer) - the maximum number of tokens to generate
Note: Please make sure to modify the default parameters as required for your use case.
"""
max_tokens: Optional[int] = None
def __init__(
self,
max_tokens: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key == "max_tokens" and value is None:
value = self.max_tokens
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"max_tokens",
"stream",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "max_tokens":
optional_params["max_tokens"] = value
return optional_params
class VertexAILlama3(BaseLLM):
def __init__(self) -> None:
pass
def create_vertex_llama3_url(
self, vertex_location: str, vertex_project: str
) -> str:
return f"https://{vertex_location}-aiplatform.googleapis.com/v1beta1/projects/{vertex_project}/locations/{vertex_location}/endpoints/openapi"
def completion(
self,
model: str,
messages: list,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
logging_obj,
optional_params: dict,
custom_prompt_dict: dict,
headers: Optional[dict],
timeout: Union[float, httpx.Timeout],
vertex_project=None,
vertex_location=None,
vertex_credentials=None,
litellm_params=None,
logger_fn=None,
acompletion: bool = False,
client=None,
):
try:
import vertexai
from google.cloud import aiplatform
from litellm.llms.openai import OpenAIChatCompletion
from litellm.llms.vertex_httpx import VertexLLM
except Exception:
raise VertexAIError(
status_code=400,
message="""vertexai import failed please run `pip install -U "google-cloud-aiplatform>=1.38"`""",
)
if not (
hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
):
raise VertexAIError(
status_code=400,
message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
)
try:
vertex_httpx_logic = VertexLLM()
access_token, project_id = vertex_httpx_logic._ensure_access_token(
credentials=vertex_credentials, project_id=vertex_project
)
openai_chat_completions = OpenAIChatCompletion()
## Load Config
# config = litellm.VertexAILlama3.get_config()
# for k, v in config.items():
# if k not in optional_params:
# optional_params[k] = v
## CONSTRUCT API BASE
stream: bool = optional_params.get("stream", False) or False
optional_params["stream"] = stream
api_base = self.create_vertex_llama3_url(
vertex_location=vertex_location or "us-central1",
vertex_project=vertex_project or project_id,
)
return openai_chat_completions.completion(
model=model,
messages=messages,
api_base=api_base,
api_key=access_token,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
logging_obj=logging_obj,
optional_params=optional_params,
acompletion=acompletion,
litellm_params=litellm_params,
logger_fn=logger_fn,
client=client,
timeout=timeout,
)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
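A hedged sketch of calling the handler above through `litellm.completion` - the `meta/` prefix is what triggers the new branch added to `main.py` later in this diff, and the model name matches the `vertex_ai/meta/llama3-405b-instruct-maas` pricing entry (project and location values are placeholders):

```python
import litellm

# assumes Google application-default credentials are available
response = litellm.completion(
    model="vertex_ai/meta/llama3-405b-instruct-maas",
    messages=[{"role": "user", "content": "Hello"}],
    vertex_project="my-gcp-project",   # placeholder project id
    vertex_location="us-central1",
)
```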

View file

@ -1033,7 +1033,7 @@ class VertexLLM(BaseLLM):
model=model, custom_llm_provider=_custom_llm_provider
)
except Exception as e:
verbose_logger.error(
verbose_logger.warning(
"Unable to identify if system message supported. Defaulting to 'False'. Received error message - {}\nAdd it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json".format(
str(e)
)
@ -1189,7 +1189,7 @@ class VertexLLM(BaseLLM):
response.raise_for_status()
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise VertexAIError(status_code=error_code, message=response.text)
raise VertexAIError(status_code=error_code, message=err.response.text)
except httpx.TimeoutException:
raise VertexAIError(status_code=408, message="Timeout error occurred.")

View file

@ -120,6 +120,7 @@ from .llms.prompt_templates.factory import (
)
from .llms.text_completion_codestral import CodestralTextCompletion
from .llms.triton import TritonChatCompletion
from .llms.vertex_ai_llama import VertexAILlama3
from .llms.vertex_httpx import VertexLLM
from .llms.watsonx import IBMWatsonXAI
from .types.llms.openai import HttpxBinaryResponseContent
@ -156,6 +157,7 @@ triton_chat_completions = TritonChatCompletion()
bedrock_chat_completion = BedrockLLM()
bedrock_converse_chat_completion = BedrockConverseLLM()
vertex_chat_completion = VertexLLM()
vertex_llama_chat_completion = VertexAILlama3()
watsonxai = IBMWatsonXAI()
####### COMPLETION ENDPOINTS ################
@ -375,6 +377,7 @@ async def acompletion(
or custom_llm_provider == "predibase"
or custom_llm_provider == "bedrock"
or custom_llm_provider == "databricks"
or custom_llm_provider == "triton"
or custom_llm_provider == "clarifai"
or custom_llm_provider == "watsonx"
or custom_llm_provider in litellm.openai_compatible_providers
@ -1491,6 +1494,10 @@ def completion(
or get_secret("ANTHROPIC_BASE_URL")
or "https://api.anthropic.com/v1/complete"
)
if api_base is not None and not api_base.endswith("/v1/complete"):
api_base += "/v1/complete"
response = anthropic_text_completions.completion(
model=model,
messages=messages,
@ -1517,6 +1524,10 @@ def completion(
or get_secret("ANTHROPIC_BASE_URL")
or "https://api.anthropic.com/v1/messages"
)
if api_base is not None and not api_base.endswith("/v1/messages"):
api_base += "/v1/messages"
response = anthropic_chat_completions.completion(
model=model,
messages=messages,
@ -2055,7 +2066,26 @@ def completion(
timeout=timeout,
client=client,
)
elif model.startswith("meta/"):
model_response = vertex_llama_chat_completion.completion(
model=model,
messages=messages,
model_response=model_response,
print_verbose=print_verbose,
optional_params=new_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
vertex_location=vertex_ai_location,
vertex_project=vertex_ai_project,
vertex_credentials=vertex_credentials,
logging_obj=logging,
acompletion=acompletion,
headers=headers,
custom_prompt_dict=custom_prompt_dict,
timeout=timeout,
client=client,
)
else:
model_response = vertex_ai.completion(
model=model,
@ -2469,6 +2499,25 @@ def completion(
return generator
response = generator
elif custom_llm_provider == "triton":
api_base = litellm.api_base or api_base
model_response = triton_chat_completions.completion(
api_base=api_base,
timeout=timeout, # type: ignore
model=model,
messages=messages,
model_response=model_response,
optional_params=optional_params,
logging_obj=logging,
stream=stream,
acompletion=acompletion,
)
## RESPONSE OBJECT
response = model_response
return response
elif custom_llm_provider == "cloudflare":
api_key = (
api_key
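Sketch of exercising the new `triton` branch added above: `api_base` is passed straight through to the Triton handler shown earlier in this diff, which dispatches on whether the URL ends in `generate` (TRT-LLM) or `infer` (custom model). The deployment name and URL below are placeholders:

```python
import litellm

response = litellm.completion(
    model="triton/llama-3-8b",  # placeholder deployed model name
    api_base="http://localhost:8000/v2/models/llama-3-8b/generate",
    messages=[{"role": "user", "content": "Hello"}],
)
```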

View file

@ -760,6 +760,36 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.00000061,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000268,
"output_cost_per_token": 0.00000354,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000533,
"output_cost_per_token": 0.000016,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -1948,6 +1978,16 @@
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
"max_output_tokens": 32000,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"litellm_provider": "vertex_ai-llama_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/imagegeneration@006": {
"cost_per_image": 0.020,
"litellm_provider": "vertex_ai-image-models",
@ -3633,6 +3673,24 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-8b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-70b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77,
"max_input_tokens": 77,

View file

@ -1,5 +1,8 @@
model_list:
- model_name: groq-llama3
- model_name: "*" # all requests where model not in your config go to this deployment
litellm_params:
model: groq/llama3-groq-70b-8192-tool-use-preview
api_key: os.environ/GROQ_API_KEY
model: "openai/*" # passes our validation check that a real provider is given
api_key: ""
general_settings:
completion_model: "gpt-3.5-turbo"

View file

@ -228,6 +228,10 @@ class LiteLLMRoutes(enum.Enum):
"/utils/token_counter",
]
anthropic_routes: List = [
"/v1/messages",
]
info_routes: List = [
"/key/info",
"/team/info",
@ -880,6 +884,26 @@ class BlockTeamRequest(LiteLLMBase):
team_id: str # required
class AddTeamCallback(LiteLLMBase):
callback_name: str
callback_type: Literal["success", "failure", "success_and_failure"]
# for now - only supported for langfuse
callback_vars: Dict[
Literal["langfuse_public_key", "langfuse_secret_key", "langfuse_host"], str
]
class TeamCallbackMetadata(LiteLLMBase):
success_callback: Optional[List[str]] = []
failure_callback: Optional[List[str]] = []
# for now - only supported for langfuse
callback_vars: Optional[
Dict[
Literal["langfuse_public_key", "langfuse_secret_key", "langfuse_host"], str
]
] = {}
class LiteLLM_TeamTable(TeamBase):
spend: Optional[float] = None
max_parallel_requests: Optional[int] = None
@ -1232,6 +1256,7 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
soft_budget: Optional[float] = None
team_model_aliases: Optional[Dict] = None
team_member_spend: Optional[float] = None
team_metadata: Optional[Dict] = None
# End User Params
end_user_id: Optional[str] = None
@ -1677,3 +1702,5 @@ class ProxyErrorTypes(str, enum.Enum):
budget_exceeded = "budget_exceeded"
expired_key = "expired_key"
auth_error = "auth_error"
internal_server_error = "internal_server_error"
bad_request_error = "bad_request_error"

View file

@ -24,7 +24,7 @@ from litellm.proxy._types import (
LitellmUserRoles,
UserAPIKeyAuth,
)
from litellm.proxy.auth.auth_utils import is_openai_route
from litellm.proxy.auth.auth_utils import is_llm_api_route
from litellm.proxy.utils import PrismaClient, ProxyLogging, log_to_opentelemetry
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
@ -57,6 +57,7 @@ def common_checks(
4. If end_user (either via JWT or 'user' passed to /chat/completions, /embeddings endpoint) is in budget
5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
7. [OPTIONAL] If the request modifies guardrails - is the team allowed to change them
"""
_model = request_body.get("model", None)
if team_object is not None and team_object.blocked is True:
@ -106,7 +107,7 @@ def common_checks(
general_settings.get("enforce_user_param", None) is not None
and general_settings["enforce_user_param"] == True
):
if is_openai_route(route=route) and "user" not in request_body:
if is_llm_api_route(route=route) and "user" not in request_body:
raise Exception(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
@ -122,7 +123,7 @@ def common_checks(
+ CommonProxyErrors.not_premium_user.value
)
if is_openai_route(route=route):
if is_llm_api_route(route=route):
# loop through each enforced param
# example enforced_params ['user', 'metadata', 'metadata.generation_name']
for enforced_param in general_settings["enforced_params"]:
@ -150,7 +151,7 @@ def common_checks(
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses it's global budget
and is_openai_route(route=route)
and is_llm_api_route(route=route)
and route != "/v1/models"
and route != "/models"
):
@ -158,6 +159,22 @@ def common_checks(
raise litellm.BudgetExceededError(
current_cost=global_proxy_spend, max_budget=litellm.max_budget
)
_request_metadata: dict = request_body.get("metadata", {}) or {}
if _request_metadata.get("guardrails"):
# check if team allowed to modify guardrails
from litellm.proxy.guardrails.guardrail_helpers import can_modify_guardrails
can_modify: bool = can_modify_guardrails(team_object)
if can_modify is False:
from fastapi import HTTPException
raise HTTPException(
status_code=403,
detail={
"error": "Your team does not have permission to modify guardrails."
},
)
return True

View file

@ -46,7 +46,7 @@ def route_in_additonal_public_routes(current_route: str):
return False
def is_openai_route(route: str) -> bool:
def is_llm_api_route(route: str) -> bool:
"""
Helper to check if the provided route is an LLM API route (OpenAI-compatible or Anthropic)
@ -59,6 +59,9 @@ def is_openai_route(route: str) -> bool:
if route in LiteLLMRoutes.openai_routes.value:
return True
if route in LiteLLMRoutes.anthropic_routes.value:
return True
# fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
# Check for routes with placeholders
for openai_route in LiteLLMRoutes.openai_routes.value:

View file

@ -57,7 +57,7 @@ from litellm.proxy.auth.auth_checks import (
log_to_opentelemetry,
)
from litellm.proxy.auth.auth_utils import (
is_openai_route,
is_llm_api_route,
route_in_additonal_public_routes,
)
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
@ -924,6 +924,7 @@ async def user_api_key_auth(
rpm_limit=valid_token.team_rpm_limit,
blocked=valid_token.team_blocked,
models=valid_token.team_models,
metadata=valid_token.team_metadata,
)
user_api_key_cache.set_cache(
@ -994,9 +995,9 @@ async def user_api_key_auth(
_user_role = _get_user_role(user_id_information=user_id_information)
if not _is_user_proxy_admin(user_id_information): # if non-admin
if is_openai_route(route=route):
if is_llm_api_route(route=route):
pass
elif is_openai_route(route=request["route"].name):
elif is_llm_api_route(route=request["route"].name):
pass
elif (
route in LiteLLMRoutes.info_routes.value
@ -1049,7 +1050,7 @@ async def user_api_key_auth(
pass
elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
if is_openai_route(route=route):
if is_llm_api_route(route=route):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",

View file

@ -23,11 +23,11 @@ def initialize_callbacks_on_proxy(
)
if isinstance(value, list):
imported_list: List[Any] = []
known_compatible_callbacks = list(
get_args(litellm._custom_logger_compatible_callbacks_literal)
)
for callback in value: # ["presidio", <my-custom-callback>]
if isinstance(callback, str) and callback in known_compatible_callbacks:
if (
isinstance(callback, str)
and callback in litellm._known_custom_logger_compatible_callbacks
):
imported_list.append(callback)
elif isinstance(callback, str) and callback == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry

View file

@ -1,9 +1,26 @@
from typing import Dict
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy.proxy_server import UserAPIKeyAuth
from litellm.proxy.proxy_server import LiteLLM_TeamTable, UserAPIKeyAuth
from litellm.types.guardrails import *
def can_modify_guardrails(team_obj: Optional[LiteLLM_TeamTable]) -> bool:
if team_obj is None:
return True
team_metadata = team_obj.metadata or {}
if team_metadata.get("guardrails", None) is not None and isinstance(
team_metadata.get("guardrails"), Dict
):
if team_metadata.get("guardrails", {}).get("modify_guardrails", None) is False:
return False
return True
async def should_proceed_based_on_metadata(data: dict, guardrail_name: str) -> bool:
"""
checks if this guardrail should be applied to this call

View file

@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional
from fastapi import Request
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.proxy._types import CommonProxyErrors, UserAPIKeyAuth
from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth
from litellm.types.utils import SupportedCacheControls
if TYPE_CHECKING:
@ -39,6 +39,9 @@ def _get_metadata_variable_name(request: Request) -> str:
"""
if "thread" in request.url.path or "assistant" in request.url.path:
return "litellm_metadata"
if "/v1/messages" in request.url.path:
# anthropic API has a field called metadata
return "litellm_metadata"
else:
return "metadata"
@ -207,6 +210,32 @@ async def add_litellm_data_to_request(
**data,
} # add the team-specific configs to the completion call
# Team Callbacks controls
if user_api_key_dict.team_metadata is not None:
team_metadata = user_api_key_dict.team_metadata
if "callback_settings" in team_metadata:
callback_settings = team_metadata.get("callback_settings", None) or {}
callback_settings_obj = TeamCallbackMetadata(**callback_settings)
verbose_proxy_logger.debug(
"Team callback settings activated: %s", callback_settings_obj
)
"""
callback_settings = {
'callback_vars': {'langfuse_public_key': 'pk', 'langfuse_secret_key': 'sk_'},
'failure_callback': [],
'success_callback': ['langfuse', 'langfuse']
}
"""
data["success_callback"] = callback_settings_obj.success_callback
data["failure_callback"] = callback_settings_obj.failure_callback
if callback_settings_obj.callback_vars is not None:
# unpack callback_vars in data
for k, v in callback_settings_obj.callback_vars.items():
data[k] = v
return data

View file

@ -333,6 +333,13 @@ async def update_key_fn(
expires = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
non_default_values["expires"] = expires
if "budget_duration" in non_default_values:
duration_s = _duration_in_seconds(
duration=non_default_values["budget_duration"]
)
key_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
non_default_values["budget_reset_at"] = key_reset_at
response = await prisma_client.update_data(
token=key, data={**non_default_values, "token": key}
)
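A hedged example of the behaviour this hunk adds - updating a key's `budget_duration` now also recomputes `budget_reset_at` (the proxy URL, key values and duration below are illustrative):

```python
import requests

requests.post(
    "http://localhost:4000/key/update",
    headers={"Authorization": "Bearer sk-1234"},
    json={"key": "sk-...", "budget_duration": "30d"},  # budget_reset_at is pushed 30 days out
)
```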

View file

@ -0,0 +1,364 @@
"""
Endpoints to control callbacks per team
Use this when each team should control its own callbacks
"""
import asyncio
import copy
import json
import traceback
import uuid
from datetime import datetime, timedelta, timezone
from typing import List, Optional
import fastapi
from fastapi import APIRouter, Depends, Header, HTTPException, Request, status
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import (
AddTeamCallback,
LiteLLM_TeamTable,
ProxyErrorTypes,
ProxyException,
TeamCallbackMetadata,
UserAPIKeyAuth,
)
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.management_helpers.utils import (
add_new_member,
management_endpoint_wrapper,
)
router = APIRouter()
@router.post(
"/team/{team_id:path}/callback",
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def add_team_callbacks(
data: AddTeamCallback,
http_request: Request,
team_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
litellm_changed_by: Optional[str] = Header(
None,
description="The litellm-changed-by header enables tracking of actions performed by authorized users on behalf of other users, providing an audit trail for accountability",
),
):
"""
Add a success/failure callback to a team
Use this if you want different teams to have different success/failure callbacks
Example curl:
```
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {"langfuse_public_key": "pk-lf-xxxx1", "langfuse_secret_key": "sk-xxxxx"}
}'
```
This means for the team where team_id = dbe2f686-a686-4896-864a-4c3924458709, all LLM calls will be logged to langfuse using the public key pk-lf-xxxx1 and the secret key sk-xxxxx
"""
try:
from litellm.proxy.proxy_server import (
_duration_in_seconds,
create_audit_log_for_update,
litellm_proxy_admin_name,
prisma_client,
)
if prisma_client is None:
raise HTTPException(status_code=500, detail={"error": "No db connected"})
# Check if team_id exists already
_existing_team = await prisma_client.get_data(
team_id=team_id, table_name="team", query_type="find_unique"
)
if _existing_team is None:
raise HTTPException(
status_code=400,
detail={
"error": f"Team id = {team_id} does not exist. Please use a different team id."
},
)
# store team callback settings in metadata
team_metadata = _existing_team.metadata
team_callback_settings = team_metadata.get("callback_settings", {})
# callback settings are expected to follow the TeamCallbackMetadata schema
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
if data.callback_type == "success":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if data.callback_name in team_callback_settings_obj.success_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.success_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
team_callback_settings_obj.success_callback.append(data.callback_name)
elif data.callback_type == "failure":
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name in team_callback_settings_obj.failure_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.failure_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
team_callback_settings_obj.failure_callback.append(data.callback_name)
elif data.callback_type == "success_and_failure":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name in team_callback_settings_obj.success_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in success_callback, for team_id = {team_id}. \n Existing success_callback = {team_callback_settings_obj.success_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
if data.callback_name in team_callback_settings_obj.failure_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.failure_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
team_callback_settings_obj.success_callback.append(data.callback_name)
team_callback_settings_obj.failure_callback.append(data.callback_name)
for var, value in data.callback_vars.items():
if team_callback_settings_obj.callback_vars is None:
team_callback_settings_obj.callback_vars = {}
team_callback_settings_obj.callback_vars[var] = value
team_callback_settings_obj_dict = team_callback_settings_obj.model_dump()
team_metadata["callback_settings"] = team_callback_settings_obj_dict
team_metadata_json = json.dumps(team_metadata) # update team_metadata
new_team_row = await prisma_client.db.litellm_teamtable.update(
where={"team_id": team_id}, data={"metadata": team_metadata_json} # type: ignore
)
return {
"status": "success",
"data": new_team_row,
}
except Exception as e:
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.add_team_callbacks(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
raise ProxyException(
message="Internal Server Error, " + str(e),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
@router.post(
"/team/{team_id}/disable_logging",
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def disable_team_logging(
http_request: Request,
team_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
try:
from litellm.proxy.proxy_server import prisma_client
if prisma_client is None:
raise HTTPException(status_code=500, detail={"error": "No db connected"})
# Check if team exists
_existing_team = await prisma_client.get_data(
team_id=team_id, table_name="team", query_type="find_unique"
)
if _existing_team is None:
raise HTTPException(
status_code=404,
detail={"error": f"Team id = {team_id} does not exist."},
)
# Update team metadata to disable logging
team_metadata = _existing_team.metadata
team_callback_settings = team_metadata.get("callback_settings", {})
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
# Reset callbacks
team_callback_settings_obj.success_callback = []
team_callback_settings_obj.failure_callback = []
# Update metadata
team_metadata["callback_settings"] = team_callback_settings_obj.model_dump()
team_metadata_json = json.dumps(team_metadata)
# Update team in database
updated_team = await prisma_client.db.litellm_teamtable.update(
where={"team_id": team_id}, data={"metadata": team_metadata_json} # type: ignore
)
if updated_team is None:
raise HTTPException(
status_code=404,
detail={
"error": f"Team id = {team_id} does not exist. Error updating team logging"
},
)
return {
"status": "success",
"message": f"Logging disabled for team {team_id}",
"data": {
"team_id": updated_team.team_id,
"success_callbacks": [],
"failure_callbacks": [],
},
}
except Exception as e:
verbose_proxy_logger.error(
f"litellm.proxy.proxy_server.disable_team_logging(): Exception occurred - {str(e)}"
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
raise ProxyException(
message="Internal Server Error, " + str(e),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
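The endpoints above and below carry curl examples in their docstrings; for the new `disable_logging` route, a comparable sketch (proxy URL, team id and admin key are illustrative):

```python
import requests

resp = requests.post(
    "http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/disable_logging",
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.json())  # {"status": "success", "message": "Logging disabled for team ...", "data": {...}}
```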
@router.get(
"/team/{team_id:path}/callback",
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def get_team_callbacks(
http_request: Request,
team_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Get the success/failure callbacks and variables for a team
Example curl:
```
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Authorization: Bearer sk-1234'
```
This will return the callback settings for the team with id dbe2f686-a686-4896-864a-4c3924458709
Returns {
"status": "success",
"data": {
"team_id": team_id,
"success_callbacks": team_callback_settings_obj.success_callback,
"failure_callbacks": team_callback_settings_obj.failure_callback,
"callback_vars": team_callback_settings_obj.callback_vars,
},
}
"""
try:
from litellm.proxy.proxy_server import prisma_client
if prisma_client is None:
raise HTTPException(status_code=500, detail={"error": "No db connected"})
# Check if team_id exists
_existing_team = await prisma_client.get_data(
team_id=team_id, table_name="team", query_type="find_unique"
)
if _existing_team is None:
raise HTTPException(
status_code=404,
detail={"error": f"Team id = {team_id} does not exist."},
)
# Retrieve team callback settings from metadata
team_metadata = _existing_team.metadata
team_callback_settings = team_metadata.get("callback_settings", {})
# Convert to TeamCallbackMetadata object for consistent structure
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
return {
"status": "success",
"data": {
"team_id": team_id,
"success_callbacks": team_callback_settings_obj.success_callback,
"failure_callbacks": team_callback_settings_obj.failure_callback,
"callback_vars": team_callback_settings_obj.callback_vars,
},
}
except Exception as e:
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.get_team_callbacks(): Exception occurred - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
raise ProxyException(
message="Internal Server Error, " + str(e),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

View file

@ -363,6 +363,7 @@ async def update_team(
# set the budget_reset_at in DB
updated_kv["budget_reset_at"] = reset_at
updated_kv = prisma_client.jsonify_object(data=updated_kv)
team_row: Optional[
LiteLLM_TeamTable
] = await prisma_client.db.litellm_teamtable.update(

View file

@ -1,10 +1,21 @@
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: fireworks-llama-v3-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
api_key: "os.environ/FIREWORKS_AI_API_KEY"
router_settings:
enable_tag_filtering: True # 👈 Key Change
api_key: "os.environ/FIREWORKS"
general_settings:
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 0.0001
alert_to_webhook_url: {
"llm_too_slow": "https://hooks.slack.com/services/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",
"llm_requests_hanging": "https://hooks.slack.com/services/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",
}
litellm_settings:
success_callback: ["langfuse"]

View file

@ -170,6 +170,9 @@ from litellm.proxy.management_endpoints.key_management_endpoints import (
from litellm.proxy.management_endpoints.key_management_endpoints import (
router as key_management_router,
)
from litellm.proxy.management_endpoints.team_callback_endpoints import (
router as team_callback_router,
)
from litellm.proxy.management_endpoints.team_endpoints import router as team_router
from litellm.proxy.openai_files_endpoints.files_endpoints import (
router as openai_files_router,
@ -654,7 +657,11 @@ async def _PROXY_track_cost_callback(
global prisma_client, custom_db_client
try:
# check if it has collected an entire stream response
verbose_proxy_logger.debug("Proxy: In track_cost_callback for: %s", kwargs)
verbose_proxy_logger.debug(
"Proxy: In track_cost_callback for: kwargs=%s and completion_response: %s",
kwargs,
completion_response,
)
verbose_proxy_logger.debug(
f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}"
)
@ -1620,6 +1627,7 @@ class ProxyConfig:
alerting=general_settings.get("alerting", None),
alerting_threshold=general_settings.get("alerting_threshold", 600),
alert_types=general_settings.get("alert_types", None),
alert_to_webhook_url=general_settings.get("alert_to_webhook_url", None),
alerting_args=general_settings.get("alerting_args", None),
redis_cache=redis_usage_cache,
)
@ -2905,6 +2913,7 @@ async def chat_completion(
fastest_response_batch_completion = hidden_params.get(
"fastest_response_batch_completion", None
)
additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
# Post Call Processing
if llm_router is not None:
@ -2927,6 +2936,7 @@ async def chat_completion(
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=fastest_response_batch_completion,
**additional_headers,
)
selected_data_generator = select_data_generator(
response=response,
@ -2944,8 +2954,10 @@ async def chat_completion(
user_api_key_dict=user_api_key_dict, response=response
)
hidden_params = getattr(response, "_hidden_params", {}) or {}
additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
hidden_params = (
getattr(response, "_hidden_params", {}) or {}
) # get any updated response headers
additional_headers = hidden_params.get("additional_headers", {}) or {}
fastapi_response.headers.update(
get_custom_headers(
@ -9457,3 +9469,4 @@ app.include_router(analytics_router)
app.include_router(debugging_endpoints_router)
app.include_router(ui_crud_endpoints_router)
app.include_router(openai_files_router)
app.include_router(team_callback_router)

View file

@ -183,12 +183,12 @@ model LiteLLM_SpendLogs {
model String @default("")
model_id String? @default("") // the model id stored in proxy model db
model_group String? @default("") // public model_name / model_group
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
request_tags Json @default("[]")
api_base String? @default("")
user String? @default("")
metadata Json? @default("{}")
cache_hit String? @default("")
cache_key String? @default("")
request_tags Json? @default("[]")
team_id String?
end_user String?
requester_ip_address String?

View file

@ -0,0 +1,22 @@
import os
from anthropic import Anthropic
client = Anthropic(
# This is the default and can be omitted
base_url="http://localhost:4000",
# this is a litellm proxy key :) - not a real anthropic key
api_key="sk-s4xN1IiLTCytwtZFJaYQrA",
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
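The same proxy base_url should also work with the Anthropic SDK's streaming interface; a minimal sketch, reusing the assumed local proxy and key from the example above:

```python
# Hedged sketch: stream the same request through the LiteLLM proxy, using the
# Anthropic SDK's messages.stream() context manager.
from anthropic import Anthropic

client = Anthropic(
    base_url="http://localhost:4000",
    api_key="sk-s4xN1IiLTCytwtZFJaYQrA",  # litellm proxy key, not a real anthropic key
)
with client.messages.stream(
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello, Claude"}],
    model="claude-3-opus-20240229",
) as stream:
    for text in stream.text_stream:
        print(text, end="", flush=True)
```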

View file

@ -25,7 +25,7 @@ from typing_extensions import overload
import litellm
import litellm.litellm_core_utils
import litellm.litellm_core_utils.litellm_logging
from litellm import EmbeddingResponse, ImageResponse, ModelResponse
from litellm import EmbeddingResponse, ImageResponse, ModelResponse, get_litellm_params
from litellm._logging import verbose_proxy_logger
from litellm._service_logger import ServiceLogging, ServiceTypes
from litellm.caching import DualCache, RedisCache
@ -50,7 +50,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter
from litellm.proxy.hooks.parallel_request_limiter import (
_PROXY_MaxParallelRequestsHandler,
)
from litellm.types.utils import CallTypes
from litellm.types.utils import CallTypes, LoggedLiteLLMParams
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
@ -188,6 +188,7 @@ class ProxyLogging:
"new_model_added",
"outage_alerts",
]
self.alert_to_webhook_url: Optional[dict] = None
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
alerting_threshold=self.alerting_threshold,
alerting=self.alerting,
@ -202,6 +203,7 @@ class ProxyLogging:
redis_cache: Optional[RedisCache] = None,
alert_types: Optional[List[AlertType]] = None,
alerting_args: Optional[dict] = None,
alert_to_webhook_url: Optional[dict] = None,
):
updated_slack_alerting: bool = False
if alerting is not None:
@ -213,6 +215,9 @@ class ProxyLogging:
if alert_types is not None:
self.alert_types = alert_types
updated_slack_alerting = True
if alert_to_webhook_url is not None:
self.alert_to_webhook_url = alert_to_webhook_url
updated_slack_alerting = True
if updated_slack_alerting is True:
self.slack_alerting_instance.update_values(
@ -220,6 +225,7 @@ class ProxyLogging:
alerting_threshold=self.alerting_threshold,
alert_types=self.alert_types,
alerting_args=alerting_args,
alert_to_webhook_url=self.alert_to_webhook_url,
)
if (
@ -602,14 +608,20 @@ class ProxyLogging:
if litellm_logging_obj is not None:
## UPDATE LOGGING INPUT
_optional_params = {}
_litellm_params = {}
litellm_param_keys = LoggedLiteLLMParams.__annotations__.keys()
for k, v in request_data.items():
if k != "model" and k != "user" and k != "litellm_params":
if k in litellm_param_keys:
_litellm_params[k] = v
elif k != "model" and k != "user":
_optional_params[k] = v
litellm_logging_obj.update_environment_variables(
model=request_data.get("model", ""),
user=request_data.get("user", ""),
optional_params=_optional_params,
litellm_params=request_data.get("litellm_params", {}),
litellm_params=_litellm_params,
)
input: Union[list, str, dict] = ""
@ -832,6 +844,30 @@ class PrismaClient:
If the view doesn't exist, one will be created.
"""
# Check to see if all of the necessary views exist and if they do, simply return
# This is more efficient because it lets us check for all views in one
# query instead of multiple queries.
try:
ret = await self.db.query_raw(
"""
SELECT SUM(1) FROM pg_views
WHERE schemaname = 'public' AND viewname IN (
'LiteLLM_VerificationTokenView',
'MonthlyGlobalSpend',
'Last30dKeysBySpend',
'Last30dModelsBySpend',
'MonthlyGlobalSpendPerKey',
'Last30dTopEndUsersSpend'
)
"""
)
if ret[0]['sum'] == 6:
print("All necessary views exist!") # noqa
return
except Exception:
pass
try:
# Try to select one row from the view
await self.db.query_raw(
@ -1313,8 +1349,10 @@ class PrismaClient:
t.tpm_limit AS team_tpm_limit,
t.rpm_limit AS team_rpm_limit,
t.models AS team_models,
t.metadata AS team_metadata,
t.blocked AS team_blocked,
t.team_alias AS team_alias,
t.metadata AS team_metadata,
tm.spend AS team_member_spend,
m.aliases as team_model_aliases
FROM "LiteLLM_VerificationToken" AS v

View file

@ -895,6 +895,52 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
from litellm.tests.test_completion import response_format_tests
@pytest.mark.parametrize(
"model", ["vertex_ai/meta/llama3-405b-instruct-maas"]
) # "vertex_ai",
@pytest.mark.parametrize("sync_mode", [True, False]) # "vertex_ai",
@pytest.mark.asyncio
async def test_llama_3_httpx(model, sync_mode):
try:
load_vertex_ai_credentials()
litellm.set_verbose = True
messages = [
{
"role": "system",
"content": "Your name is Litellm Bot, you are a helpful assistant",
},
# User asks for their name and weather in San Francisco
{
"role": "user",
"content": "Hello, what is your name and can you tell me the weather?",
},
]
data = {
"model": model,
"messages": messages,
}
if sync_mode:
response = litellm.completion(**data)
else:
response = await litellm.acompletion(**data)
response_format_tests(response=response)
print(f"response: {response}")
except litellm.RateLimitError as e:
pass
except Exception as e:
if "429 Quota exceeded" in str(e):
pass
else:
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
def vertex_httpx_mock_reject_prompt_post(*args, **kwargs):
mock_response = MagicMock()
mock_response.status_code = 200

View file

@ -48,6 +48,42 @@ def test_anthropic_completion_input_translation():
]
def test_anthropic_completion_input_translation_with_metadata():
"""
Tests that cost tracking works as expected with LiteLLM Proxy
LiteLLM Proxy will insert litellm_metadata for anthropic endpoints to track user_api_key and user_api_key_team_id
This test ensures that the `litellm_metadata` is not present in the translated input
It ensures that `litellm.acompletion()` will receive metadata, which is a litellm-specific param
"""
data = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
"litellm_metadata": {
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
"user_api_key_alias": None,
"user_api_end_user_max_budget": None,
"litellm_api_version": "1.40.19",
"global_max_parallel_requests": None,
"user_api_key_user_id": "default_user_id",
"user_api_key_org_id": None,
"user_api_key_team_id": None,
"user_api_key_team_alias": None,
"user_api_key_team_max_budget": None,
"user_api_key_team_spend": None,
"user_api_key_spend": 0.0,
"user_api_key_max_budget": None,
"user_api_key_metadata": {},
},
}
translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data)
assert "litellm_metadata" not in translated_input
assert "metadata" in translated_input
assert translated_input["metadata"] == data["litellm_metadata"]
def test_anthropic_completion_e2e():
litellm.set_verbose = True

View file

@ -0,0 +1,29 @@
import asyncio
import logging
import os
import time
import pytest
from dotenv import load_dotenv
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
load_dotenv()
import logging
@pytest.mark.asyncio()
async def test_async_otel_callback():
litellm.set_verbose = True
litellm.success_callback = ["arize"]
await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hi test from local arize"}],
mock_response="hello",
temperature=0.1,
user="OTEL_USER",
)

View file

@ -2,18 +2,19 @@
# This tests chaos-monkey scenarios - what happens if random parts of the system are broken or things aren't sent correctly.
# Expect to add more edge cases to this over time.
import sys, os
import os
import sys
import traceback
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from litellm import completion, embedding
from litellm.utils import Message
# litellm.set_verbose = True
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
@ -74,6 +75,8 @@ def test_completion_invalid_param_cohere():
response = completion(model="command-nightly", messages=messages, seed=12)
pytest.fail(f"This should have failed cohere does not support `seed` parameter")
except Exception as e:
assert isinstance(e, litellm.UnsupportedParamsError)
print("got an exception=", str(e))
if " cohere does not support parameters: {'seed': 12}" in str(e):
pass
else:

View file

@ -0,0 +1,53 @@
# What is this?
## This tests the braintrust integration
import asyncio
import os
import random
import sys
import time
import traceback
from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
load_dotenv()
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
import logging
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import litellm
from litellm.llms.custom_httpx.http_handler import HTTPHandler
def test_braintrust_logging():
import litellm
http_client = HTTPHandler()
setattr(
litellm.integrations.braintrust_logging,
"global_braintrust_sync_http_handler",
http_client,
)
with patch.object(http_client, "post", new=MagicMock()) as mock_client:
# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
)
mock_client.assert_called()
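Outside the mocked test, a hedged sketch of the same callback logging a real completion; the `BRAINTRUST_API_KEY` env var name is an assumption about what the integration reads:

```python
# Hedged sketch: log a real completion to Braintrust (no mocking).
# Assumes BRAINTRUST_API_KEY is the env var the braintrust integration reads.
import os
import litellm

os.environ["BRAINTRUST_API_KEY"] = ""  # set your key here
litellm.callbacks = ["braintrust"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from the braintrust logging sketch"}],
)
print(response.choices[0].message.content)
```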

View file

@ -346,7 +346,7 @@ def test_completion_claude_3_empty_response():
messages = [
{
"role": "system",
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
"content": [{"type": "text", "text": "You are 2twNLGfqk4GMOn3ffp4p."}],
},
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
{"role": "assistant", "content": "Good morning! How are you doing today?"},
@ -1364,6 +1364,12 @@ def test_completion_openai_response_headers():
print("response_headers=", response._response_headers)
assert response._response_headers is not None
assert "x-ratelimit-remaining-tokens" in response._response_headers
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
# /chat/completion - with streaming
@ -1376,6 +1382,12 @@ def test_completion_openai_response_headers():
print("streaming response_headers=", response_headers)
assert response_headers is not None
assert "x-ratelimit-remaining-tokens" in response_headers
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
for chunk in streaming_response:
print("chunk=", chunk)
@ -1390,6 +1402,12 @@ def test_completion_openai_response_headers():
print("embedding_response_headers=", embedding_response_headers)
assert embedding_response_headers is not None
assert "x-ratelimit-remaining-tokens" in embedding_response_headers
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
litellm.return_response_headers = False
@ -2542,6 +2560,71 @@ def test_completion_anyscale_with_functions():
# test_completion_anyscale_with_functions()
def test_completion_azure_extra_headers():
# this tests if we can pass extra_headers (e.g. Ocp-Apim-Subscription-Key) to completion, and that they are sent on the outgoing request.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
from httpx import Client
from openai import AzureOpenAI
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
http_client = Client()
with patch.object(http_client, "send", new=MagicMock()) as mock_client:
litellm.client_session = http_client
try:
response = completion(
model="azure/chatgpt-v-2",
messages=messages,
api_base=os.getenv("AZURE_API_BASE"),
api_version="2023-07-01-preview",
api_key=os.getenv("AZURE_API_KEY"),
extra_headers={
"Authorization": "my-bad-key",
"Ocp-Apim-Subscription-Key": "hello-world-testing",
},
)
print(response)
pytest.fail("Expected this to fail")
except Exception as e:
pass
mock_client.assert_called()
print(f"mock_client.call_args: {mock_client.call_args}")
request = mock_client.call_args[0][0]
print(request.method) # This will print 'POST'
print(request.url) # This will print the full URL
print(request.headers) # This will print the request headers
auth_header = request.headers.get("Authorization")
apim_key = request.headers.get("Ocp-Apim-Subscription-Key")
print(auth_header)
assert auth_header == "my-bad-key"
assert apim_key == "hello-world-testing"
def test_completion_azure_ad_token():
# this tests if we can pass azure_ad_token to completion, when the api_key is not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
from httpx import Client
from openai import AzureOpenAI
from litellm import completion
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
response = completion(
model="azure/chatgpt-v-2",
messages=messages,
# api_key="my-fake-ad-token",
azure_ad_token=os.getenv("AZURE_API_KEY"),
)
print(response)
def test_completion_azure_key_completion_arg():
# this tests if we can pass api_key to completion, when it's not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!

View file

@ -881,6 +881,7 @@ def test_completion_azure_ai():
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_cost_hidden_params(sync_mode):
litellm.return_response_headers = True
if sync_mode:
response = litellm.completion(
model="gpt-3.5-turbo",

View file

@ -235,6 +235,7 @@ class CompletionCustomHandler(
assert isinstance(kwargs["optional_params"], dict)
assert isinstance(kwargs["litellm_params"], dict)
assert isinstance(kwargs["litellm_params"]["metadata"], Optional[dict])
assert isinstance(kwargs["start_time"], (datetime, type(None)))
assert isinstance(kwargs["stream"], bool)
assert isinstance(kwargs["user"], (str, type(None)))

View file

@ -197,6 +197,29 @@ def test_openai_azure_embedding():
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skipif(
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
reason="Cannot run without being in CircleCI Runner",
)
def test_openai_azure_embedding_with_oidc_and_cf():
# TODO: Switch to our own Azure account, currently using ai.moda's account
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
try:
response = embedding(
model="azure/text-embedding-ada-002",
input=["Hello"],
azure_ad_token="oidc/circleci/",
api_base="https://eastus2-litellm.openai.azure.com/",
api_version="2024-06-01",
)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_openai_azure_embedding_optional_arg(mocker):
mocked_create_embeddings = mocker.patch.object(
openai.resources.embeddings.Embeddings,
@ -650,3 +673,17 @@ async def test_databricks_embeddings(sync_mode):
# print(response)
# local_proxy_embeddings()
def test_embedding_azure_ad_token():
# this tests if we can pass azure_ad_token to embedding, when the api_key is not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
response = embedding(
model="azure/azure-embedding-model",
input=["good morning from litellm"],
azure_ad_token=os.getenv("AZURE_API_KEY"),
)
print(response)

View file

@ -64,6 +64,30 @@ async def test_content_policy_exception_azure():
pytest.fail(f"An exception occurred - {str(e)}")
@pytest.mark.asyncio
async def test_content_policy_exception_openai():
try:
# this is only a test - we needed some way to invoke the exception :(
litellm.set_verbose = True
response = await litellm.acompletion(
model="gpt-3.5-turbo-0613",
stream=True,
messages=[
{"role": "user", "content": "Gimme the lyrics to Don't Stop Me Now"}
],
)
async for chunk in response:
print(chunk)
except litellm.ContentPolicyViolationError as e:
print("caught a content policy violation error! Passed")
print("exception", e)
assert e.llm_provider == "openai"
pass
except Exception as e:
print()
pytest.fail(f"An exception occurred - {str(e)}")
# Test 1: Context Window Errors
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.parametrize("model", exception_models)

View file

@ -36,6 +36,7 @@ async def test_async_langsmith_logging():
temperature=0.2,
metadata={
"id": run_id,
"tags": ["tag1", "tag2"],
"user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
"user_api_key_alias": "ishaans-langmsith-key",
"user_api_end_user_max_budget": None,

View file

@ -128,6 +128,19 @@ def test_azure_ai_mistral_optional_params():
assert "user" not in optional_params
def test_vertex_ai_llama_3_optional_params():
litellm.vertex_llama3_models = ["meta/llama3-405b-instruct-maas"]
litellm.drop_params = True
optional_params = get_optional_params(
model="meta/llama3-405b-instruct-maas",
user="John",
custom_llm_provider="vertex_ai",
max_tokens=10,
temperature=0.2,
)
assert "user" not in optional_params
def test_azure_gpt_optional_params_gpt_vision():
# for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. We assert we actually set extra_body here
optional_params = litellm.utils.get_optional_params(

View file

@ -212,7 +212,7 @@ def test_convert_url_to_img():
[
("data:image/jpeg;base64,1234", "image/jpeg"),
("data:application/pdf;base64,1234", "application/pdf"),
("data:image\/jpeg;base64,1234", "image/jpeg"),
(r"data:image\/jpeg;base64,1234", "image/jpeg"),
],
)
def test_base64_image_input(url, expected_media_type):

View file

@ -19,7 +19,7 @@ import pytest
import litellm
from litellm.proxy._types import LiteLLMRoutes
from litellm.proxy.auth.auth_utils import is_openai_route
from litellm.proxy.auth.auth_utils import is_llm_api_route
from litellm.proxy.proxy_server import app
# Configure logging
@ -77,8 +77,8 @@ def test_routes_on_litellm_proxy():
("/v1/non_existent_endpoint", False),
],
)
def test_is_openai_route(route: str, expected: bool):
assert is_openai_route(route) == expected
def test_is_llm_api_route(route: str, expected: bool):
assert is_llm_api_route(route) == expected
# Test case for routes that are similar but should return False
@ -91,5 +91,10 @@ def test_is_openai_route(route: str, expected: bool):
"/engines/model/invalid/completions",
],
)
def test_is_openai_route_similar_but_false(route: str):
assert is_openai_route(route) == False
def test_is_llm_api_route_similar_but_false(route: str):
assert is_llm_api_route(route) == False
def test_anthropic_api_routes():
# allow non proxy admins to call anthropic api routes
assert is_llm_api_route(route="/v1/messages") is True

View file

@ -173,6 +173,63 @@ def test_chat_completion(mock_acompletion, client_no_auth):
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
@mock_patch_acompletion()
@pytest.mark.asyncio
async def test_team_disable_guardrails(mock_acompletion, client_no_auth):
"""
If a team is not allowed to turn guardrails on/off,
raise a 403 Forbidden error when the team makes a request on `/key/generate` or `/chat/completions`.
"""
import asyncio
import json
import time
from fastapi import HTTPException, Request
from starlette.datastructures import URL
from litellm.proxy._types import LiteLLM_TeamTable, ProxyException, UserAPIKeyAuth
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.proxy_server import hash_token, user_api_key_cache
_team_id = "1234"
user_key = "sk-12345678"
valid_token = UserAPIKeyAuth(
team_id=_team_id,
team_blocked=True,
token=hash_token(user_key),
last_refreshed_at=time.time(),
)
await asyncio.sleep(1)
team_obj = LiteLLM_TeamTable(
team_id=_team_id,
blocked=False,
last_refreshed_at=time.time(),
metadata={"guardrails": {"modify_guardrails": False}},
)
user_api_key_cache.set_cache(key=hash_token(user_key), value=valid_token)
user_api_key_cache.set_cache(key="team_id:{}".format(_team_id), value=team_obj)
setattr(litellm.proxy.proxy_server, "user_api_key_cache", user_api_key_cache)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "prisma_client", "hello-world")
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
body = {"metadata": {"guardrails": {"hide_secrets": False}}}
json_bytes = json.dumps(body).encode("utf-8")
request._body = json_bytes
try:
await user_api_key_auth(request=request, api_key="Bearer " + user_key)
pytest.fail("Expected to raise 403 forbidden error.")
except ProxyException as e:
assert e.code == 403
from litellm.tests.test_custom_callback_input import CompletionCustomHandler

View file

@ -12,6 +12,8 @@ sys.path.insert(
import pytest
from litellm import get_secret
from litellm.proxy.secret_managers.aws_secret_manager import load_aws_secret_manager
from litellm.llms.azure import get_azure_ad_token_from_oidc
from litellm.llms.bedrock_httpx import BedrockLLM
@pytest.mark.skip(reason="AWS Suspended Account")
@ -60,7 +62,7 @@ def test_oidc_github():
)
def test_oidc_circleci():
secret_val = get_secret(
"oidc/circleci/https://bedrock-runtime.us-east-1.amazonaws.com/model/amazon.titan-text-express-v1/invoke"
"oidc/circleci/"
)
print(f"secret_val: {redact_oidc_signature(secret_val)}")
@ -76,3 +78,38 @@ def test_oidc_circleci_v2():
)
print(f"secret_val: {redact_oidc_signature(secret_val)}")
@pytest.mark.skipif(
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
reason="Cannot run without being in CircleCI Runner",
)
def test_oidc_circleci_with_azure():
# TODO: Switch to our own Azure account, currently using ai.moda's account
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
azure_ad_token = get_azure_ad_token_from_oidc("oidc/circleci/")
print(f"secret_val: {redact_oidc_signature(azure_ad_token)}")
@pytest.mark.skipif(
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
reason="Cannot run without being in CircleCI Runner",
)
def test_oidc_circle_v1_with_amazon():
# The purpose of this test is to get logs using the older v1 of the CircleCI OIDC token
# TODO: This is using ai.moda's IAM role, we should use LiteLLM's IAM role eventually
aws_role_name = (
"arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci-v1-assume-only"
)
aws_web_identity_token = "oidc/circleci/"
bllm = BedrockLLM()
creds = bllm.get_credentials(
aws_region_name="ca-west-1",
aws_web_identity_token=aws_web_identity_token,
aws_role_name=aws_role_name,
aws_session_name="assume-v1-session",
)

View file

@ -1988,25 +1988,30 @@ async def test_hf_completion_tgi_stream():
# test on openai completion call
def test_openai_chat_completion_call():
try:
litellm.set_verbose = False
print(f"making openai chat completion call")
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
print(f"outside chunk: {chunk}")
if finished:
break
complete_response += chunk
# print(f'complete_chunk: {complete_response}')
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
except:
print(f"error occurred: {traceback.format_exc()}")
pass
litellm.set_verbose = False
litellm.return_response_headers = True
print(f"making openai chat completion call")
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
print(f"response._hidden_params: {response._hidden_params}")
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
print(f"outside chunk: {chunk}")
if finished:
break
complete_response += chunk
# print(f'complete_chunk: {complete_response}')
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
# test_openai_chat_completion_call()

View file

@ -1,4 +1,4 @@
from typing import Iterable, List, Optional, Union
from typing import Any, Dict, Iterable, List, Optional, Union
from pydantic import BaseModel, validator
from typing_extensions import Literal, Required, TypedDict
@ -113,6 +113,9 @@ class AnthropicMessagesRequest(TypedDict, total=False):
top_k: int
top_p: float
# litellm param - used for tracking litellm proxy metadata in the request
litellm_metadata: dict
class ContentTextBlockDelta(TypedDict):
"""

View file

@ -436,6 +436,7 @@ class ChatCompletionRequest(TypedDict, total=False):
function_call: Union[str, dict]
functions: List
user: str
metadata: dict # litellm specific param
class ChatCompletionDeltaChunk(TypedDict, total=False):

View file

@ -1029,3 +1029,22 @@ class GenericImageParsingChunk(TypedDict):
class ResponseFormatChunk(TypedDict, total=False):
type: Required[Literal["json_object", "text"]]
response_schema: dict
class LoggedLiteLLMParams(TypedDict, total=False):
force_timeout: Optional[float]
custom_llm_provider: Optional[str]
api_base: Optional[str]
litellm_call_id: Optional[str]
model_alias_map: Optional[dict]
metadata: Optional[dict]
model_info: Optional[dict]
proxy_server_request: Optional[dict]
acompletion: Optional[bool]
preset_cache_key: Optional[str]
no_log: Optional[bool]
input_cost_per_second: Optional[float]
input_cost_per_token: Optional[float]
output_cost_per_token: Optional[float]
output_cost_per_second: Optional[float]
cooldown_time: Optional[float]
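As a rough illustration of the pattern this TypedDict enables (and which the ProxyLogging change earlier in this diff relies on), here is a self-contained sketch that splits request kwargs by membership in `LoggedLiteLLMParams.__annotations__`; the sample request and trimmed field list are illustrative only:

```python
# Hedged sketch: split a kwargs dict into litellm params vs. other optional params
# by checking membership in the TypedDict's annotation keys.
from typing import Optional
from typing_extensions import TypedDict


class LoggedLiteLLMParams(TypedDict, total=False):  # trimmed field list for the sketch
    api_base: Optional[str]
    metadata: Optional[dict]
    cooldown_time: Optional[float]


request_data = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.2,
    "api_base": "https://example.invalid/v1",  # placeholder base url
    "metadata": {"team_id": "my-team"},
}

litellm_param_keys = LoggedLiteLLMParams.__annotations__.keys()
litellm_params = {k: v for k, v in request_data.items() if k in litellm_param_keys}
optional_params = {
    k: v
    for k, v in request_data.items()
    if k not in litellm_param_keys and k not in ("model", "user")
}
print(litellm_params)   # {'api_base': ..., 'metadata': {...}}
print(optional_params)  # {'temperature': 0.2}
```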

View file

@ -129,6 +129,7 @@ from .exceptions import (
ServiceUnavailableError,
Timeout,
UnprocessableEntityError,
UnsupportedParamsError,
)
from .proxy._types import KeyManagementSystem
from .types.llms.openai import (
@ -158,6 +159,7 @@ from typing import (
Tuple,
Union,
cast,
get_args,
)
from .caching import Cache
@ -224,17 +226,6 @@ last_fetched_at_keys = None
# }
class UnsupportedParamsError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(method="POST", url=" https://openai.api.com/v1/")
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
############################################################
def print_verbose(
print_statement,
@ -405,7 +396,6 @@ def function_setup(
# Pop the async items from input_callback in reverse order to avoid index issues
for index in reversed(removed_async_items):
litellm.input_callback.pop(index)
if len(litellm.success_callback) > 0:
removed_async_items = []
for index, callback in enumerate(litellm.success_callback): # type: ignore
@ -417,9 +407,9 @@ def function_setup(
# we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback)
removed_async_items.append(index)
elif callback == "langsmith":
elif callback in litellm._known_custom_logger_compatible_callbacks:
callback_class = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class( # type: ignore
callback, internal_usage_cache=None, llm_router=None
callback, internal_usage_cache=None, llm_router=None # type: ignore
)
# don't double add a callback
@ -3088,6 +3078,15 @@ def get_optional_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
elif custom_llm_provider == "vertex_ai" and model in litellm.vertex_llama3_models:
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.VertexAILlama3Config().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
elif custom_llm_provider == "sagemaker":
## check if unsupported param passed in
supported_params = get_supported_openai_params(
@ -4189,6 +4188,9 @@ def get_supported_openai_params(
return litellm.GoogleAIStudioGeminiConfig().get_supported_openai_params()
elif custom_llm_provider == "vertex_ai":
if request_type == "chat_completion":
if model.startswith("meta/"):
return litellm.VertexAILlama3Config().get_supported_openai_params()
return litellm.VertexAIConfig().get_supported_openai_params()
elif request_type == "embeddings":
return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
@ -4484,7 +4486,11 @@ def get_llm_provider(
or get_secret("TOGETHER_AI_TOKEN")
)
elif custom_llm_provider == "friendliai":
api_base = "https://inference.friendli.ai/v1"
api_base = (
api_base
or get_secret("FRIENDLI_API_BASE")
or "https://inference.friendli.ai/v1"
)
dynamic_api_key = (
api_key
or get_secret("FRIENDLIAI_API_KEY")
@ -5678,6 +5684,14 @@ def convert_to_model_response_object(
_response_headers: Optional[dict] = None,
):
received_args = locals()
if _response_headers is not None:
llm_response_headers = {
"{}-{}".format("llm_provider", k): v for k, v in _response_headers.items()
}
if hidden_params is not None:
hidden_params["additional_headers"] = llm_response_headers
else:
hidden_params = {"additional_headers": llm_response_headers}
### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
if (
response_object is not None
@ -5744,10 +5758,12 @@ def convert_to_model_response_object(
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if "created" in response_object:
model_response_object.created = response_object["created"]
model_response_object.created = response_object["created"] or int(
time.time()
)
if "id" in response_object:
model_response_object.id = response_object["id"]
model_response_object.id = response_object["id"] or str(uuid.uuid4())
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object[
@ -8312,8 +8328,13 @@ class CustomStreamWrapper:
or {}
)
self._hidden_params = {
"model_id": (_model_info.get("id", None))
"model_id": (_model_info.get("id", None)),
} # returned as x-litellm-model-id response header in proxy
if _response_headers is not None:
self._hidden_params["additional_headers"] = {
"{}-{}".format("llm_provider", k): v
for k, v in _response_headers.items()
}
self._response_headers = _response_headers
self.response_id = None
self.logging_loop = None
@ -8808,11 +8829,14 @@ class CustomStreamWrapper:
str_line.choices[0].content_filter_result
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
error_message = "{} Response={}".format(
self.custom_llm_provider, str(dict(str_line))
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
raise litellm.ContentPolicyViolationError(
message=error_message,
llm_provider=self.custom_llm_provider,
model=self.model,
)
# checking for logprobs
@ -9094,6 +9118,42 @@ class CustomStreamWrapper:
except Exception as e:
raise e
def handle_triton_stream(self, chunk):
try:
if isinstance(chunk, dict):
parsed_response = chunk
elif isinstance(chunk, (str, bytes)):
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "text_output" in chunk:
response = chunk.replace("data: ", "").strip()
parsed_response = json.loads(response)
else:
return {
"text": "",
"is_finished": False,
"prompt_tokens": 0,
"completion_tokens": 0,
}
else:
print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
raise ValueError(
f"Unable to parse response. Original response: {chunk}"
)
text = parsed_response.get("text_output", "")
finish_reason = parsed_response.get("stop_reason")
is_finished = parsed_response.get("is_finished", False)
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
"prompt_tokens": parsed_response.get("input_token_count", 0),
"completion_tokens": parsed_response.get("generated_token_count", 0),
}
return {"text": "", "is_finished": False}
except Exception as e:
raise e
def handle_clarifai_completion_chunk(self, chunk):
try:
if isinstance(chunk, dict):
@ -9513,6 +9573,12 @@ class CustomStreamWrapper:
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "triton":
response_obj = self.handle_triton_stream(chunk)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "text-completion-openai":
response_obj = self.handle_openai_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"]
@ -10068,6 +10134,7 @@ class CustomStreamWrapper:
or self.custom_llm_provider == "predibase"
or self.custom_llm_provider == "databricks"
or self.custom_llm_provider == "bedrock"
or self.custom_llm_provider == "triton"
or self.custom_llm_provider == "watsonx"
or self.custom_llm_provider in litellm.openai_compatible_endpoints
):

View file

@ -760,6 +760,36 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.00000061,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000268,
"output_cost_per_token": 0.00000354,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000533,
"output_cost_per_token": 0.000016,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -1948,6 +1978,16 @@
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
"max_output_tokens": 32000,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"litellm_provider": "vertex_ai-llama_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/imagegeneration@006": {
"cost_per_image": 0.020,
"litellm_provider": "vertex_ai-image-models",
@ -3633,6 +3673,24 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-8b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-70b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77,
"max_input_tokens": 77,

prometheus.yml Normal file
View file

@ -0,0 +1,7 @@
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'litellm'
static_configs:
- targets: ['litellm:4000'] # Assuming Litellm exposes metrics at port 4000
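Assuming the proxy exposes Prometheus metrics at the target above (for example with a `prometheus` callback enabled in its config), you can sanity-check the scrape endpoint directly; the URL and port in this sketch are assumptions:

```python
# Hedged sketch: confirm the /metrics endpoint the scrape config points at is
# actually serving Prometheus text-format metrics.
import requests

resp = requests.get("http://localhost:4000/metrics", timeout=5)
resp.raise_for_status()
print(resp.text[:500])  # should show lines like "# HELP ..." / "# TYPE ..."
```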

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.26"
version = "1.42.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.41.26"
version = "1.42.0"
version_files = [
"pyproject.toml:^version"
]

View file

@ -172,7 +172,7 @@ model LiteLLM_Config {
model LiteLLM_SpendLogs {
request_id String @id
call_type String
api_key String @default ("")
api_key String @default ("") // Hashed API Token. Not the actual Virtual Key. Equivalent to 'token' column in LiteLLM_VerificationToken
spend Float @default(0.0)
total_tokens Int @default(0)
prompt_tokens Int @default(0)
@ -183,12 +183,12 @@ model LiteLLM_SpendLogs {
model String @default("")
model_id String? @default("") // the model id stored in proxy model db
model_group String? @default("") // public model_name / model_group
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
request_tags Json @default("[]")
api_base String? @default("")
user String? @default("")
metadata Json? @default("{}")
cache_hit String? @default("")
cache_key String? @default("")
request_tags Json? @default("[]")
team_id String?
end_user String?
requester_ip_address String?
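With these columns now nullable, old rows or writers that omit them can come back as `None`. A hedged sketch of a defensive read using the `prisma_client.db.<table>` access pattern seen elsewhere in this diff; the helper name is hypothetical:

```python
# Hedged sketch: treat the newly-nullable LiteLLM_SpendLogs columns as optional
# when reading them back, falling back to the old defaults.
async def read_spend_log(prisma_client, request_id: str) -> dict:
    row = await prisma_client.db.litellm_spendlogs.find_unique(
        where={"request_id": request_id}
    )
    if row is None:
        return {}
    return {
        "api_base": row.api_base or "",
        "user": row.user or "",
        "metadata": row.metadata or {},
        "cache_hit": row.cache_hit or "",
        "request_tags": row.request_tags or [],
    }
```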