Merge branch 'main' into feat/friendliai

This commit is contained in:
Wonseok Lee (Jack) 2024-06-13 09:59:56 +09:00 committed by GitHub
commit 776c75c1e5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
99 changed files with 202794 additions and 632 deletions

View file

@ -202,6 +202,7 @@ jobs:
-e REDIS_PORT=$REDIS_PORT \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e MISTRAL_API_KEY=$MISTRAL_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \

View file

@ -150,7 +150,7 @@ $ litellm --config /path/to/config.yaml
```bash
curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-H "Authorization: Bearer sk-1234"
```
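The same listing call can be made with the OpenAI SDK pointed at the proxy; a minimal sketch (the key and URL mirror the curl example above):
```python
import openai

# point the OpenAI client at the LiteLLM proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# list assistants, newest first
assistants = client.beta.assistants.list(order="desc", limit=20)
for assistant in assistants.data:
    print(assistant.id)
```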
**Create a Thread**
@ -162,6 +162,14 @@ curl http://0.0.0.0:4000/v1/threads \
-d ''
```
**Get a Thread**
```bash
curl http://0.0.0.0:4000/v1/threads/{thread_id} \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
**Add Messages to the Thread**
```bash

View file

@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t
</TabItem>
</Tabs>
## Switch Cache On / Off Per LiteLLM Call
LiteLLM supports 4 cache-controls:
- `no-cache`: *Optional(bool)* When `True`, will not return a cached response; the actual endpoint is called instead.
- `no-store`: *Optional(bool)* When `True`, will not cache the response.
- `ttl`: *Optional(int)* Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within the user-defined age (in seconds).
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
<Tabs>
<TabItem value="no-cache" label="No-Cache">
Example usage `no-cache` - when `True`, will not return a cached response
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-cache": True},
)
```
</TabItem>
<TabItem value="no-store" label="No-Store">
Example usage `no-store` - when `True`, will not cache the response.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-store": True},
)
```
</TabItem>
<TabItem value="ttl" label="ttl">
Example usage `ttl` - cache the response for 10 seconds
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"ttl": 10},
)
```
</TabItem>
<TabItem value="s-maxage" label="s-maxage">
Example usage `s-maxage` - will only accept cached responses no older than 60 seconds
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"s-maxage": 60},
)
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache

View file

@ -0,0 +1,46 @@
import Image from '@theme/IdealImage';
# Raw Request/Response Logging
See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.).
**on SDK**
```python
# pip install langfuse
import litellm
import os
# log raw request/response
litellm.log_raw_request_response = True
# from https://cloud.langfuse.com/
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
# Optional, defaults to https://cloud.langfuse.com
os.environ["LANGFUSE_HOST"] # optional
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set langfuse as a callback, litellm will send the data to langfuse
litellm.success_callback = ["langfuse"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
**on Proxy**
```yaml
litellm_settings:
log_raw_request_response: True
```
**Expected Log**
<Image img={require('../../img/raw_request_log.png')}/>

View file

@ -1,3 +1,5 @@
# llmcord.py
llmcord.py lets you and your friends chat with LLMs directly in your Discord server. It works with practically any LLM, remote or locally hosted.
Github: https://github.com/jakobdylanc/discord-llm-chatbot

View file

@ -11,7 +11,7 @@ LiteLLM supports
:::info
Anthropic API fails requests when `max_tokens` are not passed. Due to this litellm passes `max_tokens=4096` when no `max_tokens` are passed
Anthropic API fails requests when `max_tokens` are not passed. Due to this litellm passes `max_tokens=4096` when no `max_tokens` are passed.
:::
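To override the default, set `max_tokens` explicitly; a minimal sketch:
```python
import os
from litellm import completion

os.environ["ANTHROPIC_API_KEY"] = ""

# passing max_tokens explicitly overrides the 4096 default litellm injects
response = completion(
    model="anthropic/claude-3-opus-20240229",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    max_tokens=1024,
)
```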
@ -229,17 +229,6 @@ assert isinstance(
```
### Setting `anthropic-beta` Header in Requests
Pass the `extra_headers` param to litellm; all headers will be forwarded to the Anthropic API
```python
response = completion(
model="anthropic/claude-3-opus-20240229",
messages=messages,
tools=tools,
)
```
### Forcing Anthropic Tool Use

View file

@ -3,53 +3,155 @@ import TabItem from '@theme/TabItem';
# Azure AI Studio
**Ensure the following:**
1. The API Base passed ends in `/v1/`
example:
```python
api_base = "https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com/v1/"
```
LiteLLM supports all models on Azure AI Studio
2. The `model` passed is listed in [supported models](#supported-models). You **DO NOT** need to pass your deployment name to litellm. Example: `model=azure/Mistral-large-nmefg`
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
### ENV VAR
```python
import litellm
response = litellm.completion(
model="azure/command-r-plus",
api_base="<your-deployment-base>/v1/"
api_key="eskk******"
messages=[{"role": "user", "content": "What is the meaning of life?"}],
import os
os.environ["AZURE_API_API_KEY"] = ""
os.environ["AZURE_AI_API_BASE"] = ""
```
### Example Call
```python
from litellm import completion
import os
## set ENV variables
os.environ["AZURE_API_API_KEY"] = "azure ai key"
os.environ["AZURE_AI_API_BASE"] = "azure ai base url" # e.g.: https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com/
# azure ai command-r-plus call
response = completion(
model="azure_ai/command-r-plus",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
## Sample Usage - LiteLLM Proxy
1. Add models to your config.yaml
```yaml
model_list:
- model_name: mistral
litellm_params:
model: azure/mistral-large-latest
api_base: https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com/v1/
api_key: JGbKodRcTp****
- model_name: command-r-plus
litellm_params:
model: azure/command-r-plus
api_key: os.environ/AZURE_COHERE_API_KEY
api_base: os.environ/AZURE_COHERE_API_BASE
model: azure_ai/command-r-plus
api_key: os.environ/AZURE_AI_API_KEY
api_base: os.environ/AZURE_AI_API_BASE
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)
response = client.chat.completions.create(
model="command-r-plus",
messages = [
{
"role": "system",
"content": "Be a good human!"
},
{
"role": "user",
"content": "What do you know about earth?"
}
]
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "command-r-plus",
"messages": [
{
"role": "system",
"content": "Be a good human!"
},
{
"role": "user",
"content": "What do you know about earth?"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Passing additional params - max_tokens, temperature
See all litellm.completion supported params [here](../completion/input.md#translated-openai-params)
```python
# !pip install litellm
from litellm import completion
import os
## set ENV variables
os.environ["AZURE_AI_API_KEY"] = "azure ai api key"
os.environ["AZURE_AI_API_BASE"] = "azure ai api base"
# command r plus call
response = completion(
model="azure_ai/command-r-plus",
messages = [{ "content": "Hello, how are you?","role": "user"}],
max_tokens=20,
temperature=0.5
)
```
**proxy**
```yaml
model_list:
- model_name: command-r-plus
litellm_params:
model: azure_ai/command-r-plus
api_key: os.environ/AZURE_AI_API_KEY
api_base: os.environ/AZURE_AI_API_BASE
max_tokens: 20
temperature: 0.5
```
2. Start the proxy
```bash
@ -103,9 +205,6 @@ response = litellm.completion(
</Tabs>
</TabItem>
</Tabs>
## Function Calling
<Tabs>
@ -115,8 +214,8 @@ response = litellm.completion(
from litellm import completion
# set env
os.environ["AZURE_MISTRAL_API_KEY"] = "your-api-key"
os.environ["AZURE_MISTRAL_API_BASE"] = "your-api-base"
os.environ["AZURE_AI_API_KEY"] = "your-api-key"
os.environ["AZURE_AI_API_BASE"] = "your-api-base"
tools = [
{
@ -141,9 +240,7 @@ tools = [
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="azure/mistral-large-latest",
api_base=os.getenv("AZURE_MISTRAL_API_BASE")
api_key=os.getenv("AZURE_MISTRAL_API_KEY")
model="azure_ai/mistral-large-latest",
messages=messages,
tools=tools,
tool_choice="auto",
@ -206,10 +303,12 @@ curl http://0.0.0.0:4000/v1/chat/completions \
## Supported Models
LiteLLM supports **ALL** azure ai models. Here are a few examples:
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Cohere command-r-plus | `completion(model="azure/command-r-plus", messages)` |
| Cohere ommand-r | `completion(model="azure/command-r", messages)` |
| Cohere command-r | `completion(model="azure/command-r", messages)` |
| mistral-large-latest | `completion(model="azure/mistral-large-latest", messages)` |

View file

@ -144,16 +144,135 @@ print(response)
</TabItem>
</Tabs>
## Set temperature, top p, etc.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.7,
top_p=1
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set on yaml**
```yaml
model_list:
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
temperature: <your-temp>
top_p: <your-top-p>
```
**Set on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="bedrock-claude-v1", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
top_p=1
)
print(response)
```
</TabItem>
</Tabs>
## Pass provider-specific params
If you pass a non-openai param to litellm, we'll assume it's provider-specific and send it as a kwarg in the request body. [See more](../completion/input.md#provider-specific-params)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
messages=[{ "content": "Hello, how are you?","role": "user"}],
top_k=1 # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set on yaml**
```yaml
model_list:
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
top_k: 1 # 👈 PROVIDER-SPECIFIC PARAM
```
**Set on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="bedrock-claude-v1", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
top_k=1 # 👈 PROVIDER-SPECIFIC PARAM
}
)
print(response)
```
</TabItem>
</Tabs>
## Usage - Function Calling
:::info
Claude returns its output as an XML Tree. [Here is how we translate it](https://github.com/BerriAI/litellm/blob/49642a5b00a53b1babc1a753426a8afcac85dbbe/litellm/llms/prompt_templates/factory.py#L734).
You can see the raw response via `response._hidden_params["original_response"]`.
Claude can hallucinate the XML structure, e.g. returning the list param `value` as `<value>\n<item>apple</item>\n<item>banana</item>\n</value>` or `<value>\n<list>\n<item>apple</item>\n<item>banana</item>\n</list>\n</value>`.
:::
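A minimal sketch of pulling that raw response after a tool-call request (model and tool shape taken from the examples on this page):
```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "What's the weather like in Boston today?"}],
    tools=tools,
)

# the provider's raw response, before litellm translates it to the OpenAI format
print(response._hidden_params["original_response"])
```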
LiteLLM uses Bedrock's Converse API for making tool calls
```python
from litellm import completion
@ -361,47 +480,6 @@ response = completion(
)
```
### Passing an external BedrockRuntime.Client as a parameter - Completion()
Pass an external BedrockRuntime.Client object as a parameter to litellm.completion. Useful when using an AWS credentials profile, SSO session, assumed role session, or if environment variables are not available for auth.
Create a client from session credentials:
```python
import boto3
from litellm import completion
bedrock = boto3.client(
service_name="bedrock-runtime",
region_name="us-east-1",
aws_access_key_id="",
aws_secret_access_key="",
aws_session_token="",
)
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
aws_bedrock_client=bedrock,
)
```
Create a client from AWS profile in `~/.aws/config`:
```python
import boto3
from litellm import completion
dev_session = boto3.Session(profile_name="dev-profile")
bedrock = dev_session.client(
service_name="bedrock-runtime",
region_name="us-east-1",
)
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
aws_bedrock_client=bedrock,
)
```
### SSO Login (AWS Profile)
- Set `AWS_PROFILE` environment variable
- Make bedrock completion call (sketched below)
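A minimal sketch, assuming a profile named `dev-profile` is configured in `~/.aws/config`:
```python
import os
from litellm import completion

# boto3 resolves credentials from the named profile (SSO or static)
os.environ["AWS_PROFILE"] = "dev-profile"

response = completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
)
print(response)
```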
@ -464,6 +542,56 @@ response = completion(
)
```
### Passing an external BedrockRuntime.Client as a parameter - Completion()
:::warning
This is a deprecated flow. Boto3 is not async, and `boto3.client` does not let us make the HTTP call through httpx. Pass in your aws params through the method above 👆. [See Auth Code](https://github.com/BerriAI/litellm/blob/55a20c7cce99a93d36a82bf3ae90ba3baf9a7f89/litellm/llms/bedrock_httpx.py#L284) [Add new auth flow](https://github.com/BerriAI/litellm/issues)
:::
Pass an external BedrockRuntime.Client object as a parameter to litellm.completion. Useful when using an AWS credentials profile, SSO session, assumed role session, or if environment variables are not available for auth.
Create a client from session credentials:
```python
import boto3
from litellm import completion
bedrock = boto3.client(
service_name="bedrock-runtime",
region_name="us-east-1",
aws_access_key_id="",
aws_secret_access_key="",
aws_session_token="",
)
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
aws_bedrock_client=bedrock,
)
```
Create a client from AWS profile in `~/.aws/config`:
```python
import boto3
from litellm import completion
dev_session = boto3.Session(profile_name="dev-profile")
bedrock = dev_session.client(
service_name="bedrock-runtime",
region_name="us-east-1",
)
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
aws_bedrock_client=bedrock,
)
```
## Provisioned throughput models
To use provisioned throughput Bedrock models pass
- `model=bedrock/<base-model>`, example `model=bedrock/anthropic.claude-v2`. Set `model` to any of the [Supported AWS models](#supported-aws-bedrock-models)

View file

@ -1,10 +1,13 @@
# 🆕 Clarifai
# Clarifai
Anthropic, OpenAI, Mistral, Llama and Gemini LLMs are supported on Clarifai.
:::warning
Streaming is not yet supported when using Clarifai with LiteLLM. Tracking support here: https://github.com/BerriAI/litellm/issues/4162
:::
## Pre-Requisites
`pip install clarifai`
`pip install litellm`
## Required Environment Variables
@ -12,6 +15,7 @@ To obtain your Clarifai Personal access token follow this [link](https://docs.cl
```python
os.environ["CLARIFAI_API_KEY"] = "YOUR_CLARIFAI_PAT" # CLARIFAI_PAT
```
## Usage
@ -68,7 +72,7 @@ Example Usage - Note: liteLLM supports all models deployed on Clarifai
| clarifai/meta.Llama-2.codeLlama-70b-Python | `completion('clarifai/meta.Llama-2.codeLlama-70b-Python', messages)`|
| clarifai/meta.Llama-2.codeLlama-70b-Instruct | `completion('clarifai/meta.Llama-2.codeLlama-70b-Instruct', messages)` |
## Mistal LLMs
## Mistral LLMs
| Model Name | Function Call |
|---------------------------------------------|------------------------------------------------------------------------|
| clarifai/mistralai.completion.mixtral-8x22B | `completion('clarifai/mistralai.completion.mixtral-8x22B', messages)` |

View file

@ -125,11 +125,12 @@ See all litellm.completion supported params [here](../completion/input.md#transl
from litellm import completion
import os
## set ENV variables
os.environ["PREDIBASE_API_KEY"] = "predibase key"
os.environ["DATABRICKS_API_KEY"] = "databricks key"
os.environ["DATABRICKS_API_BASE"] = "databricks api base"
# predibae llama-3 call
# databricks dbrx call
response = completion(
model="predibase/llama3-8b-instruct",
model="databricks/databricks-dbrx-instruct",
messages = [{ "content": "Hello, how are you?","role": "user"}],
max_tokens=20,
temperature=0.5

View file

@ -449,6 +449,54 @@ print(response)
</TabItem>
</Tabs>
## Usage - Function Calling
LiteLLM supports Function Calling for Vertex AI gemini models.
```python
from litellm import completion
import os
# set env
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ".."
os.environ["VERTEX_AI_PROJECT"] = ".."
os.environ["VERTEX_AI_LOCATION"] = ".."
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="vertex_ai/gemini-pro-vision",
messages=messages,
tools=tools,
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
## Chat Models
| Model Name | Function Call |
@ -500,6 +548,8 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| text-embedding-004 | `embedding(model="vertex_ai/text-embedding-004", input)` |
| text-multilingual-embedding-002 | `embedding(model="vertex_ai/text-multilingual-embedding-002", input)` |
| textembedding-gecko | `embedding(model="vertex_ai/textembedding-gecko", input)` |
| textembedding-gecko-multilingual | `embedding(model="vertex_ai/textembedding-gecko-multilingual", input)` |
| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` |
@ -508,6 +558,29 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02
| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` |
### Advanced Use `task_type` and `title` (Vertex Specific Params)
👉 `task_type` and `title` are vertex specific params
LiteLLM Supported Vertex Specific Params
```python
auto_truncate: Optional[bool] = None
task_type: Optional[Literal["RETRIEVAL_QUERY","RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING", "QUESTION_ANSWERING", "FACT_VERIFICATION"]] = None
title: Optional[str] = None # The title of the document to be embedded. (only valid with task_type=RETRIEVAL_DOCUMENT).
```
**Example Usage with LiteLLM**
```python
response = litellm.embedding(
model="vertex_ai/text-embedding-004",
input=["good morning from litellm", "gm"]
task_type = "RETRIEVAL_DOCUMENT",
dimensions=1,
auto_truncate=True,
)
```
## Image Generation Models
Usage

View file

@ -138,14 +138,22 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
<Image img={require('../../img/admin_ui_spend.png')} />
## API Endpoints to get Spend
#### Getting Spend Reports - To Charge Other Teams, API Keys
#### Getting Spend Reports - To Charge Other Teams, Customers
Use the `/global/spend/report` endpoint to get daily spend per team, with a breakdown of spend per API Key, Model
Use the `/global/spend/report` endpoint to get daily spend report per
- team
- customer [this is `user` passed to `/chat/completions` request](#how-to-track-spend-with-litellm)
<Tabs>
<TabItem value="per team" label="Spend Per Team">
##### Example Request
👉 Key Change: Specify `group_by=team`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=team' \
-H 'Authorization: Bearer sk-1234'
```
@ -254,6 +262,69 @@ Output from script
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="per customer" label="Spend Per Customer">
##### Example Request
👉 Key Change: Specify `group_by=customer`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=customer' \
-H 'Authorization: Bearer sk-1234'
```
##### Example Response
```shell
[
{
"group_by_day": "2024-04-30T00:00:00+00:00",
"customers": [
{
"customer": "palantir",
"total_spend": 0.0015265,
"metadata": [ # see the spend by unique(key + model)
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "88dc28.." # the hashed api key
},
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "a73dc2.." # the hashed api key
},
{
"model": "chatgpt-v-2",
"spend": 0.000214,
"total_tokens": 122,
"api_key": "898c28.." # the hashed api key
},
{
"model": "gpt-3.5-turbo",
"spend": 0.0000825,
"total_tokens": 85,
"api_key": "84dc28.." # the hashed api key
}
]
}
]
}
]
```
</TabItem>
</Tabs>
@ -356,4 +427,23 @@ model_list:
## Custom Input/Output Pricing
👉 Head to [Custom Input/Output Pricing](https://docs.litellm.ai/docs/proxy/custom_pricing) to set up custom pricing for your models
## ✨ Custom k,v pairs
Log specific key,value pairs as part of the metadata for a spend log
:::info
Logging specific key,value pairs in spend logs metadata is an enterprise feature. [See here](./enterprise.md#tracking-spend-with-custom-metadata)
:::
## ✨ Custom Tags
:::info
Tracking spend with Custom tags is an enterprise feature. [See here](./enterprise.md#tracking-spend-for-custom-tags)
:::

View file

@ -42,6 +42,14 @@ Set `JSON_LOGS="True"` in your env:
```bash
export JSON_LOGS="True"
```
**OR**
Set `json_logs: true` in your yaml:
```yaml
litellm_settings:
json_logs: true
```
Start proxy
@ -49,4 +57,35 @@ Start proxy
$ litellm
```
The proxy will now write all logs in JSON format.
## Control Log Output
Turn off FastAPI's default 'INFO' logs
1. Turn on 'json logs'
```yaml
litellm_settings:
json_logs: true
```
2. Set `LITELLM_LOG` to 'ERROR'
Only get logs if an error occurs.
```bash
LITELLM_LOG="ERROR"
```
3. Start proxy
```bash
$ litellm
```
Expected Output:
```bash
# no info statements
```

View file

@ -1,5 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import Image from '@theme/IdealImage';
# 🐳 Docker, Deploying LiteLLM Proxy
@ -537,7 +538,9 @@ ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
## Advanced Deployment Settings
### Customization of the server root path
### 1. Customization of the server root path (custom Proxy base url)
💥 Use this when you want to serve LiteLLM on a custom base url path like `https://localhost:4000/api/v1`
:::info
@ -548,9 +551,29 @@ In a Kubernetes deployment, it's possible to utilize a shared DNS to host multip
Customize the root path to eliminate the need for employing multiple DNS configurations during deployment.
👉 Set `SERVER_ROOT_PATH` in your .env and this will be set as your server root path
```
export SERVER_ROOT_PATH="/api/v1"
```
**Step 1. Run Proxy with `SERVER_ROOT_PATH` set in your env**
### Setting SSL Certification
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e SERVER_ROOT_PATH="/api/v1" \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
```
After running the proxy you can access it on `http://0.0.0.0:4000/api/v1/` (since we set `SERVER_ROOT_PATH="/api/v1"`)
**Step 2. Verify Running on correct path**
<Image img={require('../../img/custom_root_path.png')} />
**That's it**, that's all you need to run the proxy on a custom root path
### 2. Setting SSL Certification
Use this if you need to set SSL certificates for your on-prem litellm proxy

View file

@ -205,6 +205,146 @@ curl -X GET "http://0.0.0.0:4000/spend/tags" \
```
## Tracking Spend with custom metadata
Requirements:
- Virtual Keys & a database should be set up, see [virtual keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
#### Usage - /chat/completions requests with special spend logs metadata
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
Set `extra_body={"metadata": { }}` to `metadata` you want to pass
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"metadata": {
"spend_logs_metadata": {
"hello": "world"
}
}
}
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"metadata": {
"spend_logs_metadata": {
"hello": "world"
}
}
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
"metadata": {
"spend_logs_metadata": {
"hello": "world"
}
}
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
#### Viewing Spend w/ custom metadata
#### `/spend/logs` Request Format
```bash
curl -X GET "http://0.0.0.0:4000/spend/logs?request_id=<your-call-id" \ # e.g.: chatcmpl-9ZKMURhVYSi9D6r6PJ9vLcayIK0Vm
-H "Authorization: Bearer sk-1234"
```
#### `/spend/logs` Response Format
```bash
[
{
"request_id": "chatcmpl-9ZKMURhVYSi9D6r6PJ9vLcayIK0Vm",
"call_type": "acompletion",
"metadata": {
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
"user_api_key_alias": null,
"spend_logs_metadata": { # 👈 LOGGED CUSTOM METADATA
"hello": "world"
},
"user_api_key_team_id": null,
"user_api_key_user_id": "116544810872468347480",
"user_api_key_team_alias": null
},
}
]
```
## Enforce Required Params for LLM Requests
Use this when you want to enforce that all requests include certain params, e.g. requiring every request to include the `user` and `["metadata"]["generation_name"]` params.
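As a sketch, a request that satisfies such a policy might look like this (the enforcement config itself is not shown in this hunk; the param names below follow the example above):
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# include the enforced params: `user` and metadata.generation_name
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request"}],
    user="user-1234",
    extra_body={"metadata": {"generation_name": "my-generation"}},
)
print(response)
```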

View file

@ -606,6 +606,52 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
** 🎉 Expect to see this trace logged in your OTEL collector**
### Context propagation across Services `Traceparent HTTP Header`
❓ Use this when you want to **pass information about the incoming request in a distributed tracing system**
✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header)
```curl
traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01
```
Example Usage
1. Make Request to LiteLLM Proxy with `traceparent` header
```python
import openai
import uuid
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
example_traceparent = f"00-80e1afed08e019fc1110464cfa66635c-02e80198930058d4-01"
extra_headers = {
"traceparent": example_traceparent
}
_trace_id = example_traceparent.split("-")[1]
print("EXTRA HEADERS: ", extra_headers)
print("Trace ID: ", _trace_id)
response = client.chat.completions.create(
model="llama3",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
extra_headers=extra_headers,
)
print(response)
```
```shell
# EXTRA HEADERS: {'traceparent': '00-80e1afed08e019fc1110464cfa66635c-02e80198930058d4-01'}
# Trace ID: 80e1afed08e019fc1110464cfa66635c
```
2. Lookup Trace ID on OTEL Logger
Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} />

View file

@ -21,6 +21,7 @@ general_settings:
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```
Set slack webhook url in your env
@ -28,6 +29,11 @@ Set slack webhook url in your env
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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"
```
Turn off FastAPI's default info logs
```bash
export LITELLM_LOG="ERROR"
```
:::info
Need help or want dedicated support? Talk to a founder [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).

View file

@ -2,18 +2,13 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔥 Fallbacks, Retries, Timeouts, Load Balancing
# 🔥 Load Balancing, Fallbacks, Retries, Timeouts
Retry call with multiple instances of the same model.
If a call fails after num_retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if given).
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
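Under the hood this is litellm's Router. A minimal SDK-side sketch of the same behavior (model names, keys, and bases are placeholders):
```python
from litellm import Router

# two deployments of the same model group, plus a fallback group
router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "azure/gpt-turbo-small-eu", "api_base": "...", "api_key": "..."},
        },
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "azure/gpt-turbo-small-ca", "api_base": "...", "api_key": "..."},
        },
        {"model_name": "gpt-4", "litellm_params": {"model": "gpt-4", "api_key": "..."}},
    ],
    num_retries=2,                             # retry across deployments before failing
    fallbacks=[{"gpt-3.5-turbo": ["gpt-4"]}],  # then fall back to another model group
)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)
print(response)
```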
- Quick Start [load balancing](#test---load-balancing)
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
## Quick Start - Load Balancing
### Step 1 - Set deployments on config
#### Step 1 - Set deployments on config
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
```yaml
@ -38,50 +33,214 @@ model_list:
rpm: 1440
```
### Step 2: Start Proxy with config
#### Step 2: Start Proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
### Step 3: Use proxy - Call a model group [Load Balancing]
Curl Command
### Test - Load Balancing
Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
👉 Key Change: `model="gpt-3.5-turbo"`
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
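A sketch of reading that header with the OpenAI SDK's raw-response interface (the header name `x-litellm-model-id` is an assumption here; check your proxy's response headers):
```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

# use the raw-response interface so response headers are accessible
raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)

# each deployment has a unique id; it should vary across repeated calls
print(raw.headers.get("x-litellm-model-id"))
print(raw.parse())  # the usual ChatCompletion object
```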
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
]
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Send the same request body via curl
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="gpt-3.5-turbo",
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
### Usage - Call a specific model deployment
If you want to call a specific model defined in the `config.yaml`, you can call the `litellm_params: model`
</TabItem>
</Tabs>
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail
2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
3. The request to `model="gpt-3.5-turbo"` will succeed and the client making the request will get a response from gpt-3.5-turbo
👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="zephyr-beta",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"fallbacks": ["gpt-3.5-turbo"]
}
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `fallbacks` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "zephyr-beta"",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"fallbacks": ["gpt-3.5-turbo"]
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="zephyr-beta",
extra_body={
"fallbacks": ["gpt-3.5-turbo"]
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
<!--
### Test it!
In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "azure/gpt-turbo-small-ca",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
``` -->
## Fallbacks + Retries + Timeouts + Cooldowns
## Advanced
### Fallbacks + Retries + Timeouts + Cooldowns
**Set via config**
```yaml
@ -114,44 +273,7 @@ litellm_settings:
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
```
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"num_retries": 2,
"timeout": 10
}
'
```
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
```
## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)
### Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
@ -287,7 +409,7 @@ print(response)
</Tabs>
## Advanced - EU-Region Filtering (Pre-Call Checks)
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
@ -350,7 +472,7 @@ print(response)
print(f"response.headers.get('x-litellm-model-api-base')")
```
## Advanced - Custom Timeouts, Stream Timeouts - Per Model
### Custom Timeouts, Stream Timeouts - Per Model
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
```yaml
model_list:
@ -379,7 +501,7 @@ $ litellm --config /path/to/config.yaml
```
## Advanced - Setting Dynamic Timeouts - Per Request
### Setting Dynamic Timeouts - Per Request
LiteLLM Proxy supports setting a `timeout` per request

View file

@ -77,6 +77,28 @@ litellm_settings:
#### Step 2: Setup Oauth Client
<Tabs>
<TabItem value="okta" label="Okta SSO">
1. Add Okta credentials to your .env
```bash
GENERIC_CLIENT_ID = "<your-okta-client-id>"
GENERIC_CLIENT_SECRET = "<your-okta-client-secret>"
GENERIC_AUTHORIZATION_ENDPOINT = "<your-okta-domain>/authorize" # https://dev-2kqkcd6lx6kdkuzt.us.auth0.com/authorize
GENERIC_TOKEN_ENDPOINT = "<your-okta-domain>/token" # https://dev-2kqkcd6lx6kdkuzt.us.auth0.com/oauth/token
GENERIC_USERINFO_ENDPOINT = "<your-okta-domain>/userinfo" # https://dev-2kqkcd6lx6kdkuzt.us.auth0.com/userinfo
```
You can get your domain specific auth/token/userinfo endpoints at `<YOUR-OKTA-DOMAIN>/.well-known/openid-configuration` (see the sketch at the end of this tab)
2. Add proxy url as callback_url on Okta
On Okta, add the 'callback_url' as `<proxy_base_url>/sso/callback`
<Image img={require('../../img/okta_callback_url.png')} />
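To sanity-check the endpoints from step 1, you can fetch the discovery document directly; a small sketch using `requests` (the domain is the placeholder from the comments above):
```python
import requests

# the discovery document lists the authorization/token/userinfo endpoints
okta_domain = "https://dev-2kqkcd6lx6kdkuzt.us.auth0.com"  # placeholder

config = requests.get(f"{okta_domain}/.well-known/openid-configuration").json()
print(config["authorization_endpoint"])
print(config["token_endpoint"])
print(config["userinfo_endpoint"])
```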
</TabItem>
<TabItem value="google" label="Google SSO">
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
@ -115,7 +137,6 @@ MICROSOFT_TENANT="5a39737
</TabItem>
<TabItem value="Generic" label="Generic SSO Provider">
A generic OAuth client that can be used to quickly create support for any OAuth provider with close to no code

View file

@ -63,7 +63,7 @@ You can:
- Add budgets to Teams
#### **Add budgets to users**
#### **Add budgets to teams**
```shell
curl --location 'http://localhost:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
@ -102,6 +102,22 @@ curl --location 'http://localhost:4000/team/new' \
"budget_reset_at": null
}
```
#### **Add budget duration to teams**
`budget_duration`: Budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration in seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d").
```shell
curl 'http://0.0.0.0:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_alias": "my-new-team_4",
"members_with_roles": [{"role": "admin", "user_id": "5c4a0aa3-a1e1-43dc-bd87-3c2da8382a3a"}],
"budget_duration": 10s,
}'
```
</TabItem>
<TabItem value="per-team-member" label="For Team Members">

Binary files not shown (4 new images: 151 KiB, 279 KiB, 200 KiB, 168 KiB).

View file

@ -183,6 +183,7 @@ const sidebars = {
label: "Logging & Observability",
items: [
"debugging/local_debugging",
"observability/raw_request_response",
"observability/callbacks",
"observability/custom_callback",
"observability/langfuse_integration",
@ -256,6 +257,7 @@ const sidebars = {
"projects/GPT Migrate",
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
],
},
],

View file

@ -60,6 +60,7 @@ _async_failure_callback: List[Callable] = (
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
store_audit_logs = False # Enterprise feature, allow users to see audit logs
## end of callbacks #############
@ -407,6 +408,7 @@ openai_compatible_providers: List = [
"together_ai",
"fireworks_ai",
"friendliai",
"azure_ai",
]
@ -611,6 +613,7 @@ provider_list: List = [
"baseten",
"azure",
"azure_text",
"azure_ai",
"sagemaker",
"bedrock",
"vllm",
@ -765,7 +768,7 @@ from .llms.gemini import GeminiConfig
from .llms.nlp_cloud import NLPCloudConfig
from .llms.aleph_alpha import AlephAlphaConfig
from .llms.petals import PetalsConfig
from .llms.vertex_ai import VertexAIConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
@ -787,6 +790,7 @@ from .llms.openai import (
OpenAIConfig,
OpenAITextCompletionConfig,
MistralConfig,
MistralEmbeddingConfig,
DeepInfraConfig,
)
from .llms.azure import (

View file

@ -337,8 +337,6 @@ def response_cost_calculator(
and custom_llm_provider is True
): # override defaults if custom pricing is set
base_model = model
elif base_model is None:
base_model = model
# base_model defaults to None if not set on model_info
response_cost = completion_cost(
completion_response=response_object,

View file

@ -337,6 +337,7 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -379,6 +380,7 @@ class RejectedRequestError(BadRequestError): # type: ignore
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -418,6 +420,7 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
def __str__(self):

View file

@ -6,17 +6,23 @@ import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_logger
from litellm.types.services import ServiceLoggerPayload
from functools import wraps
from typing import Union, Optional, TYPE_CHECKING, Any
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
from litellm.proxy._types import (
ManagementEndpointLoggingPayload as _ManagementEndpointLoggingPayload,
)
Span = _Span
UserAPIKeyAuth = _UserAPIKeyAuth
ManagementEndpointLoggingPayload = _ManagementEndpointLoggingPayload
else:
Span = Any
UserAPIKeyAuth = Any
ManagementEndpointLoggingPayload = Any
LITELLM_TRACER_NAME = os.getenv("OTEL_TRACER_NAME", "litellm")
@ -247,7 +253,7 @@ class OpenTelemetry(CustomLogger):
span.end(end_time=self._to_ns(end_time))
def set_tools_attributes(self, span: Span, tools):
from opentelemetry.semconv.ai import SpanAttributes
from litellm.proxy._types import SpanAttributes
import json
if not tools:
@ -272,7 +278,7 @@ class OpenTelemetry(CustomLogger):
pass
def set_attributes(self, span: Span, kwargs, response_obj):
from opentelemetry.semconv.ai import SpanAttributes
from litellm.proxy._types import SpanAttributes
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
@ -407,7 +413,7 @@ class OpenTelemetry(CustomLogger):
)
def set_raw_request_attributes(self, span: Span, kwargs, response_obj):
from opentelemetry.semconv.ai import SpanAttributes
from litellm.proxy._types import SpanAttributes
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
@ -454,6 +460,23 @@ class OpenTelemetry(CustomLogger):
def _get_span_name(self, kwargs):
return LITELLM_REQUEST_SPAN_NAME
def get_traceparent_from_header(self, headers):
if headers is None:
return None
_traceparent = headers.get("traceparent", None)
if _traceparent is None:
return None
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator,
)
verbose_logger.debug("OpenTelemetry: GOT A TRACEPARENT {}".format(_traceparent))
propagator = TraceContextTextMapPropagator()
_parent_context = propagator.extract(carrier={"traceparent": _traceparent})
verbose_logger.debug("OpenTelemetry: PARENT CONTEXT {}".format(_parent_context))
return _parent_context
def _get_span_context(self, kwargs):
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator,
@ -545,3 +568,91 @@ class OpenTelemetry(CustomLogger):
self.OTEL_EXPORTER,
)
return BatchSpanProcessor(ConsoleSpanExporter())
async def async_management_endpoint_success_hook(
self,
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
_end_time_ns = logging_payload.end_time
start_time = logging_payload.start_time
end_time = logging_payload.end_time
if isinstance(start_time, float):
_start_time_ns = int(int(start_time) * 1e9)
else:
_start_time_ns = self._to_ns(start_time)
if isinstance(end_time, float):
_end_time_ns = int(int(end_time) * 1e9)
else:
_end_time_ns = self._to_ns(end_time)
if parent_otel_span is not None:
_span_name = logging_payload.route
management_endpoint_span = self.tracer.start_span(
name=_span_name,
context=trace.set_span_in_context(parent_otel_span),
start_time=_start_time_ns,
)
_request_data = logging_payload.request_data
if _request_data is not None:
for key, value in _request_data.items():
management_endpoint_span.set_attribute(f"request.{key}", value)
_response = logging_payload.response
if _response is not None:
for key, value in _response.items():
management_endpoint_span.set_attribute(f"response.{key}", value)
management_endpoint_span.set_status(Status(StatusCode.OK))
management_endpoint_span.end(end_time=_end_time_ns)
async def async_management_endpoint_failure_hook(
self,
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
_end_time_ns = logging_payload.end_time
start_time = logging_payload.start_time
end_time = logging_payload.end_time
if isinstance(start_time, float):
_start_time_ns = int(int(start_time) * 1e9)
else:
_start_time_ns = self._to_ns(start_time)
if isinstance(end_time, float):
_end_time_ns = int(int(end_time) * 1e9)
else:
_end_time_ns = self._to_ns(end_time)
if parent_otel_span is not None:
_span_name = logging_payload.route
management_endpoint_span = self.tracer.start_span(
name=_span_name,
context=trace.set_span_in_context(parent_otel_span),
start_time=_start_time_ns,
)
_request_data = logging_payload.request_data
if _request_data is not None:
for key, value in _request_data.items():
management_endpoint_span.set_attribute(f"request.{key}", value)
_exception = logging_payload.exception
management_endpoint_span.set_attribute(f"exception", str(_exception))
management_endpoint_span.set_status(Status(StatusCode.ERROR))
management_endpoint_span.end(end_time=_end_time_ns)

View file

@ -36,6 +36,9 @@ from ..types.llms.openai import (
AsyncAssistantStreamManager,
AssistantStreamManager,
)
from litellm.caching import DualCache
azure_ad_cache = DualCache()
class AzureOpenAIError(Exception):
@ -309,9 +312,10 @@ def select_azure_base_url_or_endpoint(azure_client_params: dict):
def get_azure_ad_token_from_oidc(azure_ad_token: str):
azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
azure_tenant = os.getenv("AZURE_TENANT_ID", None)
azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
azure_authority_host = os.getenv("AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com")
if azure_client_id is None or azure_tenant is None:
if azure_client_id is None or azure_tenant_id is None:
raise AzureOpenAIError(
status_code=422,
message="AZURE_CLIENT_ID and AZURE_TENANT_ID must be set",
@ -325,8 +329,19 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
message="OIDC token could not be retrieved from secret manager.",
)
azure_ad_token_cache_key = json.dumps({
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
})
azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
if azure_ad_token_access_token is not None:
return azure_ad_token_access_token
req_token = httpx.post(
f"https://login.microsoftonline.com/{azure_tenant}/oauth2/v2.0/token",
f"{azure_authority_host}/{azure_tenant_id}/oauth2/v2.0/token",
data={
"client_id": azure_client_id,
"grant_type": "client_credentials",
@ -342,12 +357,23 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
message=req_token.text,
)
possible_azure_ad_token = req_token.json().get("access_token", None)
azure_ad_token_json = req_token.json()
azure_ad_token_access_token = azure_ad_token_json.get("access_token", None)
azure_ad_token_expires_in = azure_ad_token_json.get("expires_in", None)
if possible_azure_ad_token is None:
raise AzureOpenAIError(status_code=422, message="Azure AD Token not returned")
if azure_ad_token_access_token is None:
raise AzureOpenAIError(
status_code=422, message="Azure AD Token access_token not returned"
)
return possible_azure_ad_token
if azure_ad_token_expires_in is None:
raise AzureOpenAIError(
status_code=422, message="Azure AD Token expires_in not returned"
)
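# Cache the access token for its reported lifetime so subsequent calls skip the token endpoint.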
azure_ad_cache.set_cache(key=azure_ad_token_cache_key, value=azure_ad_token_access_token, ttl=azure_ad_token_expires_in)
return azure_ad_token_access_token
class AzureChatCompletion(BaseLLM):

View file

@ -51,8 +51,11 @@ from litellm.types.llms.openai import (
ChatCompletionResponseMessage,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionDeltaChunk,
)
from litellm.caching import DualCache
iam_cache = DualCache()
class AmazonCohereChatConfig:
"""
@ -324,38 +327,53 @@ class BedrockLLM(BaseLLM):
) = params_to_check
### CHECK STS ###
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
oidc_token = get_secret(aws_web_identity_token)
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
iam_creds_cache_key = json.dumps({
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
})
if oidc_token is None:
raise BedrockError(
message="OIDC token could not be retrieved from secret manager.",
status_code=401,
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
if iam_creds_dict is None:
oidc_token = get_secret(aws_web_identity_token)
if oidc_token is None:
raise BedrockError(
message="OIDC token could not be retrieved from secret manager.",
status_code=401,
)
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
)
sts_client = boto3.client("sts")
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html
sts_response = sts_client.assume_role_with_web_identity(
RoleArn=aws_role_name,
RoleSessionName=aws_session_name,
WebIdentityToken=oidc_token,
DurationSeconds=3600,
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html
sts_response = sts_client.assume_role_with_web_identity(
RoleArn=aws_role_name,
RoleSessionName=aws_session_name,
WebIdentityToken=oidc_token,
DurationSeconds=3600,
)
iam_creds_dict = {
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
"aws_session_token": sts_response["Credentials"]["SessionToken"],
"region_name": aws_region_name,
}
session = boto3.Session(
aws_access_key_id=sts_response["Credentials"]["AccessKeyId"],
aws_secret_access_key=sts_response["Credentials"]["SecretAccessKey"],
aws_session_token=sts_response["Credentials"]["SessionToken"],
region_name=aws_region_name,
)
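# Cache the temporary credentials for just under the 1-hour STS session so they are refreshed before expiry.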
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
return session.get_credentials()
session = boto3.Session(**iam_creds_dict)
iam_creds = session.get_credentials()
return iam_creds
elif aws_role_name is not None and aws_session_name is not None:
sts_client = boto3.client(
"sts",
@ -1415,38 +1433,53 @@ class BedrockConverseLLM(BaseLLM):
) = params_to_check
### CHECK STS ###
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
oidc_token = get_secret(aws_web_identity_token)
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
iam_creds_cache_key = json.dumps({
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
})
if oidc_token is None:
raise BedrockError(
message="OIDC token could not be retrieved from secret manager.",
status_code=401,
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
if iam_creds_dict is None:
oidc_token = get_secret(aws_web_identity_token)
if oidc_token is None:
raise BedrockError(
message="OIDC token could not be retrieved from secret manager.",
status_code=401,
)
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
)
sts_client = boto3.client("sts")
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html
sts_response = sts_client.assume_role_with_web_identity(
RoleArn=aws_role_name,
RoleSessionName=aws_session_name,
WebIdentityToken=oidc_token,
DurationSeconds=3600,
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html
sts_response = sts_client.assume_role_with_web_identity(
RoleArn=aws_role_name,
RoleSessionName=aws_session_name,
WebIdentityToken=oidc_token,
DurationSeconds=3600,
)
iam_creds_dict = {
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
"aws_session_token": sts_response["Credentials"]["SessionToken"],
"region_name": aws_region_name,
}
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
session = boto3.Session(**iam_creds_dict)
iam_creds = session.get_credentials()
return iam_creds
elif aws_role_name is not None and aws_session_name is not None:
sts_client = boto3.client(
"sts",
@ -1859,29 +1892,59 @@ class AWSEventStreamDecoder:
self.parser = EventStreamJSONParser()
def converse_chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
text = ""
tool_str = ""
is_finished = False
finish_reason = ""
usage: Optional[ConverseTokenUsageBlock] = None
if "delta" in chunk_data:
delta_obj = ContentBlockDeltaEvent(**chunk_data["delta"])
if "text" in delta_obj:
text = delta_obj["text"]
elif "toolUse" in delta_obj:
tool_str = delta_obj["toolUse"]["input"]
elif "stopReason" in chunk_data:
finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
elif "usage" in chunk_data:
usage = ConverseTokenUsageBlock(**chunk_data["usage"]) # type: ignore
response = GenericStreamingChunk(
text=text,
tool_str=tool_str,
is_finished=is_finished,
finish_reason=finish_reason,
usage=usage,
)
return response
try:
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ConverseTokenUsageBlock] = None
index = int(chunk_data.get("contentBlockIndex", 0))
if "start" in chunk_data:
start_obj = ContentBlockStartEvent(**chunk_data["start"])
if (
start_obj is not None
and "toolUse" in start_obj
and start_obj["toolUse"] is not None
):
tool_use = {
"id": start_obj["toolUse"]["toolUseId"],
"type": "function",
"function": {
"name": start_obj["toolUse"]["name"],
"arguments": "",
},
}
elif "delta" in chunk_data:
delta_obj = ContentBlockDeltaEvent(**chunk_data["delta"])
if "text" in delta_obj:
text = delta_obj["text"]
elif "toolUse" in delta_obj:
tool_use = {
"id": None,
"type": "function",
"function": {
"name": None,
"arguments": delta_obj["toolUse"]["input"],
},
}
elif "stopReason" in chunk_data:
finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
is_finished = True
elif "usage" in chunk_data:
usage = ConverseTokenUsageBlock(**chunk_data["usage"]) # type: ignore
response = GenericStreamingChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
finish_reason=finish_reason,
usage=usage,
index=index,
)
return response
except Exception as e:
raise Exception("Received streaming error - {}".format(str(e)))
def _chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
text = ""
@ -1890,12 +1953,16 @@ class AWSEventStreamDecoder:
if "outputText" in chunk_data:
text = chunk_data["outputText"]
# ai21 mapping
if "ai21" in self.model: # fake ai21 streaming
elif "ai21" in self.model: # fake ai21 streaming
text = chunk_data.get("completions")[0].get("data").get("text") # type: ignore
is_finished = True
finish_reason = "stop"
######## bedrock.anthropic mappings ###############
elif "delta" in chunk_data:
elif (
"contentBlockIndex" in chunk_data
or "stopReason" in chunk_data
or "metrics" in chunk_data
):
return self.converse_chunk_parser(chunk_data=chunk_data)
######## bedrock.mistral mappings ###############
elif "outputs" in chunk_data:
@ -1905,7 +1972,7 @@ class AWSEventStreamDecoder:
):
text = chunk_data["outputs"][0]["text"]
stop_reason = chunk_data.get("stop_reason", None)
if stop_reason is not None:
is_finished = True
finish_reason = stop_reason
######## bedrock.cohere mappings ###############
@ -1926,8 +1993,9 @@ class AWSEventStreamDecoder:
text=text,
is_finished=is_finished,
finish_reason=finish_reason,
tool_str="",
usage=None,
index=0,
tool_use=None,
)
def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GenericStreamingChunk]:

View file

@ -139,6 +139,7 @@ def process_response(
def convert_model_to_url(model: str, api_base: str):
user_id, app_id, model_id = model.split(".")
model_id = model_id.lower()
return f"{api_base}/users/{user_id}/apps/{app_id}/models/{model_id}/outputs"
@ -171,19 +172,55 @@ async def async_completion(
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
response = await async_handler.post(
url=model, headers=headers, data=json.dumps(data)
)
logging_obj.post_call(
input=prompt,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
## RESPONSE OBJECT
try:
completion_response = response.json()
except Exception:
raise ClarifaiError(
message=response.text, status_code=response.status_code, url=model
)
# print(completion_response)
try:
choices_list = []
for idx, item in enumerate(completion_response["outputs"]):
if len(item["data"]["text"]["raw"]) > 0:
message_obj = Message(content=item["data"]["text"]["raw"])
else:
message_obj = Message(content=None)
choice_obj = Choices(
finish_reason="stop",
index=idx + 1, # check
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
except Exception as e:
raise ClarifaiError(
message=traceback.format_exc(), status_code=response.status_code, url=model
)
# Calculate Usage
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["model"] = model
model_response["usage"] = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
return model_response
def completion(
@ -241,7 +278,7 @@ def completion(
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
"api_base": model,
},
)
if acompletion == True:

View file

@ -164,6 +164,49 @@ class MistralConfig:
return optional_params
class MistralEmbeddingConfig:
"""
Reference: https://docs.mistral.ai/api/#operation/createEmbedding
"""
def __init__(
self,
) -> None:
locals_ = locals().copy()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"encoding_format",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "encoding_format":
optional_params["encoding_format"] = value
return optional_params
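A usage sketch for the new config; the import path is an assumption (the class is defined alongside MistralConfig in this module).

```python
# Assumed import path - MistralEmbeddingConfig lives next to MistralConfig.
from litellm.llms.openai import MistralEmbeddingConfig

config = MistralEmbeddingConfig()
assert config.get_supported_openai_params() == ["encoding_format"]

# The only supported OpenAI param is passed through unchanged.
optional_params = config.map_openai_params(
    non_default_params={"encoding_format": "float"},
    optional_params={},
)
print(optional_params)  # {'encoding_format': 'float'}
```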
class DeepInfraConfig:
"""
Reference: https://deepinfra.com/docs/advanced/openai_api

File diff suppressed because it is too large

View file

@ -4,6 +4,7 @@ from enum import Enum
import requests # type: ignore
import time
from typing import Callable, Optional, Union, List, Literal, Any
from pydantic import BaseModel
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid
import httpx, inspect # type: ignore
@ -12,7 +13,12 @@ from litellm.llms.prompt_templates.factory import (
convert_to_gemini_tool_call_result,
convert_to_gemini_tool_call_invoke,
)
from litellm.types.files import (
get_file_mime_type_for_file_type,
get_file_type_from_extension,
is_gemini_1_5_accepted_file_type,
is_video_file_type,
)
class VertexAIError(Exception):
@ -301,15 +307,15 @@ def _process_gemini_image(image_url: str) -> PartType:
# GCS URIs
if "gs://" in image_url:
# Figure out file type
extension_with_dot = os.path.splitext(image_url)[-1] # Ex: ".png"
extension = extension_with_dot[1:] # Ex: "png"
file_type = get_file_type_from_extension(extension)
# Validate the file type is supported by Gemini
if not is_gemini_1_5_accepted_file_type(file_type):
raise Exception(f"File type not supported by gemini - {file_type}")
mime_type = get_file_mime_type_for_file_type(file_type)
file_data = FileDataType(mime_type=mime_type, file_uri=image_url)
@ -320,7 +326,7 @@ def _process_gemini_image(image_url: str) -> PartType:
image = _load_image_from_url(image_url)
_blob = BlobType(data=image.data, mime_type=image._mime_type)
return PartType(inline_data=_blob)
# Base64 encoding
elif "base64" in image_url:
import base64, re
@ -1293,6 +1299,95 @@ async def async_streaming(
return streamwrapper
class VertexAITextEmbeddingConfig(BaseModel):
"""
Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#TextEmbeddingInput
Args:
auto_truncate: Optional(bool) If True, will truncate input text to fit within the model's max input length.
task_type: Optional(str) The type of task to be performed. The default is "RETRIEVAL_QUERY".
title: Optional(str) The title of the document to be embedded. (only valid with task_type=RETRIEVAL_DOCUMENT).
"""
auto_truncate: Optional[bool] = None
task_type: Optional[
Literal[
"RETRIEVAL_QUERY",
"RETRIEVAL_DOCUMENT",
"SEMANTIC_SIMILARITY",
"CLASSIFICATION",
"CLUSTERING",
"QUESTION_ANSWERING",
"FACT_VERIFICATION",
]
] = None
title: Optional[str] = None
def __init__(
self,
auto_truncate: Optional[bool] = None,
task_type: Optional[
Literal[
"RETRIEVAL_QUERY",
"RETRIEVAL_DOCUMENT",
"SEMANTIC_SIMILARITY",
"CLASSIFICATION",
"CLUSTERING",
"QUESTION_ANSWERING",
"FACT_VERIFICATION",
]
] = None,
title: Optional[str] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"dimensions",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "dimensions":
optional_params["output_dimensionality"] = value
return optional_params
def get_mapped_special_auth_params(self) -> dict:
"""
Common auth params across bedrock/vertex_ai/azure/watsonx
"""
return {"project": "vertex_project", "region_name": "vertex_location"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
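With the `embedding()` changes further down, `task_type`/`title` flow end to end; a hedged example (the model string is an assumption):

```python
import litellm

# Hedged sketch: `task_type` is one of the Literal values accepted above and,
# per the embedding() change below, each input is cast to a TextEmbeddingInput.
response = litellm.embedding(
    model="vertex_ai/text-embedding-004",  # model string assumed
    input=["why does LiteLLM map task_type?"],
    task_type="RETRIEVAL_QUERY",
)
```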
def embedding(
model: str,
input: Union[list, str],
@ -1316,7 +1411,7 @@ def embedding(
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
import google.auth # type: ignore
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
@ -1347,6 +1442,16 @@ def embedding(
if isinstance(input, str):
input = [input]
if optional_params is not None and isinstance(optional_params, dict):
if optional_params.get("task_type") or optional_params.get("title"):
# if user passed task_type or title, cast to TextEmbeddingInput
_task_type = optional_params.pop("task_type", None)
_title = optional_params.pop("title", None)
input = [
TextEmbeddingInput(text=x, task_type=_task_type, title=_title)
for x in input
]
try:
llm_model = TextEmbeddingModel.from_pretrained(model)
except Exception as e:
@ -1363,7 +1468,8 @@ def embedding(
encoding=encoding,
)
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
_input_dict = {"texts": input, **optional_params}
request_str = f"""embeddings = llm_model.get_embeddings({_input_dict})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
@ -1375,7 +1481,7 @@ def embedding(
)
try:
embeddings = llm_model.get_embeddings(**_input_dict)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
@ -1383,6 +1489,7 @@ def embedding(
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
input_tokens: int = 0
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
@ -1391,14 +1498,10 @@ def embedding(
"embedding": embedding.values,
}
)
input_tokens += embedding.statistics.token_count
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
@ -1420,7 +1523,8 @@ async def async_embedding(
"""
Async embedding implementation
"""
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
_input_dict = {"texts": input, **optional_params}
request_str = f"""embeddings = llm_model.get_embeddings({_input_dict})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
@ -1432,7 +1536,7 @@ async def async_embedding(
)
try:
embeddings = await client.get_embeddings_async(**_input_dict)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
@ -1440,6 +1544,7 @@ async def async_embedding(
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
input_tokens: int = 0
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
@ -1448,18 +1553,13 @@ async def async_embedding(
"embedding": embedding.values,
}
)
input_tokens += embedding.statistics.token_count
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response

View file

@ -11,10 +11,10 @@ import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union, BinaryIO
from typing_extensions import overload
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx
import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore
@ -335,6 +335,7 @@ async def acompletion(
or custom_llm_provider == "predibase"
or custom_llm_provider == "bedrock"
or custom_llm_provider == "databricks"
or custom_llm_provider == "clarifai"
or custom_llm_provider in litellm.openai_compatible_providers
): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
init_response = await loop.run_in_executor(None, func_with_context)

View file

@ -1387,6 +1387,26 @@
"mode": "image_generation",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
},
"text-embedding-004": {
"max_tokens": 3072,
"max_input_tokens": 3072,
"output_vector_size": 768,
"input_cost_per_token": 0.00000000625,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"text-multilingual-embedding-002": {
"max_tokens": 2048,
"max_input_tokens": 2048,
"output_vector_size": 768,
"input_cost_per_token": 0.00000000625,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": {
"max_tokens": 3072,
"max_input_tokens": 3072,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[45980,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-6a03368053f9d26d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-83a8bdddfe32b5d9.js\",\"777\",\"static/chunks/777-17b0c91edd3a24fe.js\",\"931\",\"static/chunks/app/page-d61796ff0d3a8faf.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"tghLG7_IS7i5OkQJRvCIl\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid 
rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[45980,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-6a03368053f9d26d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-83a8bdddfe32b5d9.js\",\"777\",\"static/chunks/777-17b0c91edd3a24fe.js\",\"931\",\"static/chunks/app/page-bd882aee817406ff.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"48nWsJi-LJrUlOLzcK-Yz\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid 
rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[45980,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-83a8bdddfe32b5d9.js","777","static/chunks/777-17b0c91edd3a24fe.js","931","static/chunks/app/page-d61796ff0d3a8faf.js"],""]
3:I[45980,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-83a8bdddfe32b5d9.js","777","static/chunks/777-17b0c91edd3a24fe.js","931","static/chunks/app/page-bd882aee817406ff.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tghLG7_IS7i5OkQJRvCIl",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["48nWsJi-LJrUlOLzcK-Yz",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -2,6 +2,6 @@
3:I[87494,["294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","777","static/chunks/777-17b0c91edd3a24fe.js","418","static/chunks/app/model_hub/page-4cb65c32467214b5.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tghLG7_IS7i5OkQJRvCIl",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["48nWsJi-LJrUlOLzcK-Yz",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -2,6 +2,6 @@
3:I[667,["665","static/chunks/3014691f-589a5f4865c3822f.js","294","static/chunks/294-0e35509d5ca95267.js","684","static/chunks/684-bb2d2f93d92acb0b.js","777","static/chunks/777-17b0c91edd3a24fe.js","461","static/chunks/app/onboarding/page-664c7288e11fff5a.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tghLG7_IS7i5OkQJRvCIl",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["48nWsJi-LJrUlOLzcK-Yz",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,7 +1,12 @@
import json
import logging
from logging import Formatter
import sys
import os
from litellm import json_logs
# Set default log level to INFO
log_level = os.getenv("LITELLM_LOG", "INFO")
numeric_level: int = getattr(logging, log_level.upper())
class JsonFormatter(Formatter):
@ -16,6 +21,14 @@ class JsonFormatter(Formatter):
logger = logging.root
handler = logging.StreamHandler()
if json_logs:
handler.setFormatter(JsonFormatter())
else:
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
datefmt="%H:%M:%S",
)
handler.setFormatter(formatter)
logger.handlers = [handler]
logger.setLevel(numeric_level)
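The resulting knobs, as a sketch (both names are taken from the diff; note that `json_logs` and `LITELLM_LOG` are resolved once, at import time):

```python
import os

# LITELLM_LOG must be set before litellm is imported, since the root log
# level is resolved at import time; the same goes for the json_logs formatter.
os.environ["LITELLM_LOG"] = "DEBUG"

import litellm  # noqa: E402 - imported after the env var so the level applies
```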

View file

@ -719,6 +719,8 @@ class Member(LiteLLMBase):
@model_validator(mode="before")
@classmethod
def check_user_info(cls, values):
if not isinstance(values, dict):
raise ValueError("input needs to be a dictionary")
if values.get("user_id") is None and values.get("user_email") is None:
raise ValueError("Either user id or user email must be provided")
return values
@ -757,9 +759,24 @@ class GlobalEndUsersSpend(LiteLLMBase):
class TeamMemberAddRequest(LiteLLMBase):
team_id: str
member: Union[List[Member], Member]
max_budget_in_team: Optional[float] = None # Users max budget within the team
def __init__(self, **data):
member_data = data.get("member")
if isinstance(member_data, list):
# If member is a list of dictionaries, convert each dictionary to a Member object
members = [Member(**item) for item in member_data]
# Replace member_data with the list of Member objects
data["member"] = members
elif isinstance(member_data, dict):
# If member is a dictionary, convert it to a single Member object
member = Member(**member_data)
# Replace member_data with the single Member object
data["member"] = member
# Call the superclass __init__ method to initialize the object
super().__init__(**data)
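Both payload shapes now validate; a sketch with illustrative member fields (Member also carries a role alongside user_id/user_email):

```python
# Single-member payload, as before.
single = TeamMemberAddRequest(
    team_id="team-1",
    member={"role": "user", "user_email": "alice@example.com"},
)

# New: a list of members in one request.
bulk = TeamMemberAddRequest(
    team_id="team-1",
    member=[
        {"role": "user", "user_email": "alice@example.com"},
        {"role": "admin", "user_id": "user-123"},
    ],
)
```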
class TeamMemberDeleteRequest(LiteLLMBase):
team_id: str
@ -1472,6 +1489,9 @@ class SpendLogsMetadata(TypedDict):
user_api_key_team_id: Optional[str]
user_api_key_user_id: Optional[str]
user_api_key_team_alias: Optional[str]
spend_logs_metadata: Optional[
dict
] # special param to log k,v pairs to spendlogs for a call
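A hedged caller-side sketch for the new field (assumes the SDK/proxy forwards `metadata` into the spend-logs payload):

```python
import litellm

# `spend_logs_metadata` rides along under `metadata` and is written to the
# SpendLogs row for this call (assumption: metadata passthrough is enabled).
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    metadata={"spend_logs_metadata": {"request_source": "batch-job-42"}},
)
```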
class SpendLogsPayload(TypedDict):
@ -1496,3 +1516,60 @@ class SpendLogsPayload(TypedDict):
request_tags: str # json str
team_id: Optional[str]
end_user: Optional[str]
class SpanAttributes(str, enum.Enum):
# Note: We've taken this from opentelemetry-semantic-conventions-ai
# I chose not to add a new dependency to litellm for this
# Semantic Conventions for LLM requests, this needs to be removed after
# OpenTelemetry Semantic Conventions support Gen AI.
# Issue at https://github.com/open-telemetry/opentelemetry-python/issues/3868
# Refer to https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
LLM_SYSTEM = "gen_ai.system"
LLM_REQUEST_MODEL = "gen_ai.request.model"
LLM_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
LLM_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
LLM_REQUEST_TOP_P = "gen_ai.request.top_p"
LLM_PROMPTS = "gen_ai.prompt"
LLM_COMPLETIONS = "gen_ai.completion"
LLM_RESPONSE_MODEL = "gen_ai.response.model"
LLM_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
LLM_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
LLM_TOKEN_TYPE = "gen_ai.token.type"
# To be added
# LLM_RESPONSE_FINISH_REASON = "gen_ai.response.finish_reasons"
# LLM_RESPONSE_ID = "gen_ai.response.id"
# LLM
LLM_REQUEST_TYPE = "llm.request.type"
LLM_USAGE_TOTAL_TOKENS = "llm.usage.total_tokens"
LLM_USAGE_TOKEN_TYPE = "llm.usage.token_type"
LLM_USER = "llm.user"
LLM_HEADERS = "llm.headers"
LLM_TOP_K = "llm.top_k"
LLM_IS_STREAMING = "llm.is_streaming"
LLM_FREQUENCY_PENALTY = "llm.frequency_penalty"
LLM_PRESENCE_PENALTY = "llm.presence_penalty"
LLM_CHAT_STOP_SEQUENCES = "llm.chat.stop_sequences"
LLM_REQUEST_FUNCTIONS = "llm.request.functions"
LLM_REQUEST_REPETITION_PENALTY = "llm.request.repetition_penalty"
LLM_RESPONSE_FINISH_REASON = "llm.response.finish_reason"
LLM_RESPONSE_STOP_REASON = "llm.response.stop_reason"
LLM_CONTENT_COMPLETION_CHUNK = "llm.content.completion.chunk"
# OpenAI
LLM_OPENAI_RESPONSE_SYSTEM_FINGERPRINT = "gen_ai.openai.system_fingerprint"
LLM_OPENAI_API_BASE = "gen_ai.openai.api_base"
LLM_OPENAI_API_VERSION = "gen_ai.openai.api_version"
LLM_OPENAI_API_TYPE = "gen_ai.openai.api_type"
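Illustrative only: how these semantic-convention names attach to an OTEL span (assumes the opentelemetry SDK is installed):

```python
from opentelemetry import trace

tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("litellm_request") as span:
    # SpanAttributes is a str enum, so .value yields the attribute key.
    span.set_attribute(SpanAttributes.LLM_SYSTEM.value, "openai")
    span.set_attribute(SpanAttributes.LLM_REQUEST_MODEL.value, "gpt-3.5-turbo")
    span.set_attribute(SpanAttributes.LLM_USAGE_TOTAL_TOKENS.value, 42)
```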
class ManagementEndpointLoggingPayload(LiteLLMBase):
route: str
request_data: dict
response: Optional[dict] = None
exception: Optional[Any] = None
start_time: Optional[datetime] = None
end_time: Optional[datetime] = None

View file

@ -151,8 +151,8 @@ def common_checks(
and route != "/models"
):
if global_proxy_spend > litellm.max_budget:
raise litellm.BudgetExceededError(
current_cost=global_proxy_spend, max_budget=litellm.max_budget
)
return True

View file

@ -0,0 +1,31 @@
from typing import Optional
from fastapi import Request
import ast, json
async def _read_request_body(request: Optional[Request]) -> dict:
"""
Asynchronous function to read the request body and parse it as JSON or literal data.
Parameters:
- request: The request object to read the body from
Returns:
- dict: Parsed request data as a dictionary
"""
try:
request_data: dict = {}
if request is None:
return request_data
body = await request.body()
if body == b"" or body is None:
return request_data
body_str = body.decode()
try:
request_data = ast.literal_eval(body_str)
except:
request_data = json.loads(body_str)
return request_data
except:
return {}
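A usage sketch inside a FastAPI route (route and handler names are illustrative):

```python
from fastapi import FastAPI, Request

app = FastAPI()

@app.post("/echo")
async def echo(request: Request):
    # Falls back to {} on empty or unparsable bodies, per the helper above.
    data = await _read_request_body(request=request)
    return {"received": data}
```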

View file

@ -0,0 +1,90 @@
from datetime import datetime
from functools import wraps
from litellm.proxy._types import UserAPIKeyAuth, ManagementEndpointLoggingPayload
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
from fastapi import Request
def management_endpoint_wrapper(func):
"""
This wrapper does the following:
1. Log I/O, Exceptions to OTEL
2. Create an Audit log for success calls
"""
@wraps(func)
async def wrapper(*args, **kwargs):
start_time = datetime.now()
try:
result = await func(*args, **kwargs)
end_time = datetime.now()
if kwargs is None:
kwargs = {}
user_api_key_dict: UserAPIKeyAuth = (
kwargs.get("user_api_key_dict") or UserAPIKeyAuth()
)
parent_otel_span = user_api_key_dict.parent_otel_span
if parent_otel_span is not None:
from litellm.proxy.proxy_server import open_telemetry_logger
if open_telemetry_logger is not None:
_http_request: Request = kwargs.get("http_request")
_route = _http_request.url.path
_request_body: dict = await _read_request_body(
request=_http_request
)
_response = dict(result) if result is not None else None
logging_payload = ManagementEndpointLoggingPayload(
route=_route,
request_data=_request_body,
response=_response,
start_time=start_time,
end_time=end_time,
)
await open_telemetry_logger.async_management_endpoint_success_hook(
logging_payload=logging_payload,
parent_otel_span=parent_otel_span,
)
return result
except Exception as e:
end_time = datetime.now()
if kwargs is None:
kwargs = {}
user_api_key_dict: UserAPIKeyAuth = (
kwargs.get("user_api_key_dict") or UserAPIKeyAuth()
)
parent_otel_span = user_api_key_dict.parent_otel_span
if parent_otel_span is not None:
from litellm.proxy.proxy_server import open_telemetry_logger
if open_telemetry_logger is not None:
_http_request: Request = kwargs.get("http_request")
_route = _http_request.url.path
_request_body: dict = await _read_request_body(
request=_http_request
)
logging_payload = ManagementEndpointLoggingPayload(
route=_route,
request_data=_request_body,
response=None,
start_time=start_time,
end_time=end_time,
exception=e,
)
await open_telemetry_logger.async_management_endpoint_failure_hook(
logging_payload=logging_payload,
parent_otel_span=parent_otel_span,
)
raise e
return wrapper
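A hedged sketch of how a management route opts in. The wrapper looks up `http_request` and `user_api_key_dict` in kwargs, so both must be keyword parameters; the route path and auth dependency here are illustrative, not from the diff:

```python
from fastapi import APIRouter, Depends, Request

router = APIRouter()

@router.post("/team/member_add")
@management_endpoint_wrapper  # logs request/response + exceptions to OTEL
async def team_member_add(
    http_request: Request,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),  # assumed dependency
):
    return {"status": "success"}
```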

View file

@ -79,10 +79,6 @@ async def add_litellm_data_to_request(
data["cache"][k] = v
verbose_proxy_logger.debug("receiving data: %s", data)
if "metadata" not in data:
data["metadata"] = {}

View file

@ -0,0 +1,63 @@
# What is this?
## Helper utils for the management endpoints (keys/users/teams)
from litellm.proxy._types import LiteLLM_TeamTable, Member, UserAPIKeyAuth
from litellm.proxy.utils import PrismaClient
import uuid
from typing import Optional
async def add_new_member(
new_member: Member,
max_budget_in_team: Optional[float],
prisma_client: PrismaClient,
team_id: str,
user_api_key_dict: UserAPIKeyAuth,
litellm_proxy_admin_name: str,
):
"""
Add a new member to a team
- add team id to user table
- add team member w/ budget to team member table
"""
## ADD TEAM ID, to USER TABLE IF NEW ##
if new_member.user_id is not None:
await prisma_client.db.litellm_usertable.update(
where={"user_id": new_member.user_id},
data={"teams": {"push": [team_id]}},
)
elif new_member.user_email is not None:
user_data = {"user_id": str(uuid.uuid4()), "user_email": new_member.user_email}
## user email is not unique acc. to prisma schema -> future improvement
### for now: check if it exists in db, if not - insert it
existing_user_row = await prisma_client.get_data(
key_val={"user_email": new_member.user_email},
table_name="user",
query_type="find_all",
)
if existing_user_row is None or (
isinstance(existing_user_row, list) and len(existing_user_row) == 0
):
await prisma_client.insert_data(data=user_data, table_name="user")
# Check if trying to set a budget for team member
if max_budget_in_team is not None and new_member.user_id is not None:
# create a new budget item for this member
response = await prisma_client.db.litellm_budgettable.create(
data={
"max_budget": max_budget_in_team,
"created_by": user_api_key_dict.user_id or litellm_proxy_admin_name,
"updated_by": user_api_key_dict.user_id or litellm_proxy_admin_name,
}
)
_budget_id = response.budget_id
await prisma_client.db.litellm_teammembership.create(
data={
"team_id": team_id,
"user_id": new_member.user_id,
"budget_id": _budget_id,
}
)

View file

@ -14,10 +14,9 @@ model_list:
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: mistral-embed
litellm_params:
model: triton/any"
api_base: https://exampleopenaiendpoint-production.up.railway.app/triton/embeddings
model: mistral/mistral-embed
general_settings:
master_key: sk-1234
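A client-side check for this config, as a sketch (proxy assumed on localhost:4000, authenticated with the master key above):

```python
import openai

# Point the standard OpenAI client at the LiteLLM proxy.
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
response = client.embeddings.create(
    model="mistral-embed",
    input=["hello from litellm"],
)
print(len(response.data[0].embedding))
```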

View file

@ -90,6 +90,7 @@ from litellm.types.llms.openai import (
HttpxBinaryResponseContent,
)
from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
from litellm.proxy.management_helpers.utils import add_new_member
from litellm.proxy.utils import (
PrismaClient,
DBClient,
@ -102,7 +103,6 @@ from litellm.proxy.utils import (
hash_token,
html_form,
missing_keys_html_form,
_is_valid_team_configs,
_is_user_proxy_admin,
_get_user_role,
@ -114,6 +114,8 @@ from litellm.proxy.utils import (
_to_ns,
get_error_message_str,
)
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
from litellm import (
CreateBatchRequest,
RetrieveBatchRequest,
@ -160,6 +162,10 @@ from litellm.proxy.auth.auth_checks import (
get_user_object,
allowed_routes_check,
get_actual_routes,
log_to_opentelemetry,
)
from litellm.proxy.common_utils.management_endpoint_utils import (
management_endpoint_wrapper,
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
@ -368,6 +374,11 @@ from typing import Dict
api_key_header = APIKeyHeader(
name="Authorization", auto_error=False, description="Bearer token"
)
azure_api_key_header = APIKeyHeader(
name="API-Key",
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
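An illustrative request exercising this fallback, where the key arrives in an `API-Key` header rather than `Authorization: Bearer ...`:

```python
import httpx

resp = httpx.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"API-Key": "sk-1234"},  # what older openai clients send
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "hi"}],
    },
)
print(resp.status_code)
```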
user_api_base = None
user_model = None
user_debug = False
@ -508,18 +519,27 @@ async def check_request_disconnection(request: Request, llm_api_call_task):
async def user_api_key_auth(
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
) -> UserAPIKeyAuth:
global master_key, prisma_client, llm_model_list, user_custom_auth, custom_db_client, general_settings, proxy_logging_obj
try:
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
name="Received Proxy Server Request",
start_time=_to_ns(datetime.now()),
context=open_telemetry_logger.get_traceparent_from_header(
headers=request.headers
),
)
### USER-DEFINED AUTH FUNCTION ###
if user_custom_auth is not None:
@ -1062,8 +1082,9 @@ async def user_api_key_auth(
_user_id = _user.get("user_id", None)
if user_current_spend > user_max_budget:
raise litellm.BudgetExceededError(
current_cost=user_current_spend,
max_budget=user_max_budget,
)
else:
# Token exists, not expired now check if its in budget for the user
@ -1094,9 +1115,11 @@ async def user_api_key_auth(
)
if user_current_spend > user_max_budget:
raise litellm.BudgetExceededError(
current_cost=user_current_spend,
max_budget=user_max_budget,
)
# Check 3. Check if user is in their team budget
if valid_token.team_member_spend is not None:
if prisma_client is not None:
@ -1130,8 +1153,9 @@ async def user_api_key_auth(
)
if team_member_budget is not None and team_member_budget > 0:
if valid_token.team_member_spend > team_member_budget:
raise litellm.BudgetExceededError(
current_cost=valid_token.team_member_spend,
max_budget=team_member_budget,
)
# Check 3. If token is expired
@ -1189,8 +1213,9 @@ async def user_api_key_auth(
####################################
if valid_token.spend >= valid_token.max_budget:
raise Exception(
f"ExceededTokenBudget: Current spend for token: {valid_token.spend}; Max Budget for Token: {valid_token.max_budget}"
raise litellm.BudgetExceededError(
current_cost=valid_token.spend,
max_budget=valid_token.max_budget,
)
# Check 5. Token Model Spend is under Model budget
@ -1226,8 +1251,9 @@ async def user_api_key_auth(
):
current_model_spend = model_spend[0]["_sum"]["spend"]
current_model_budget = max_budget_per_model[current_model]
raise Exception(
f"ExceededModelBudget: Current spend for model: {current_model_spend}; Max Budget for Model: {current_model_budget}"
raise litellm.BudgetExceededError(
current_cost=current_model_spend,
max_budget=current_model_budget,
)
# Check 6. Team spend is under Team budget
@ -1251,8 +1277,9 @@ async def user_api_key_auth(
)
if valid_token.team_spend >= valid_token.team_max_budget:
raise Exception(
f"ExceededTokenBudget: Current Team Spend: {valid_token.team_spend}; Max Budget for Team: {valid_token.team_max_budget}"
raise litellm.BudgetExceededError(
current_cost=valid_token.team_spend,
max_budget=valid_token.team_max_budget,
)
# Check 8: Additional Common Checks across jwt + key auth
@ -1495,7 +1522,7 @@ async def user_api_key_auth(
)
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid token passed")
raise Exception("Invalid proxy server token passed")
if valid_token_dict is not None:
if user_id_information is not None and _is_user_proxy_admin(
user_id_information
@ -1528,6 +1555,14 @@ async def user_api_key_auth(
str(e)
)
)
# Log this exception to OTEL
if open_telemetry_logger is not None:
await open_telemetry_logger.async_post_call_failure_hook(
original_exception=e,
user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, litellm.BudgetExceededError):
raise ProxyException(
@ -7803,6 +7838,10 @@ async def get_global_spend_report(
default=None,
description="Time till which to view spend",
),
group_by: Optional[Literal["team", "customer"]] = fastapi.Query(
default="team",
description="Group spend by internal team or customer",
),
):
"""
Get daily spend per team (or per customer, when group_by="customer") based on a specific startTime and endTime. Per group, view usage by each key and model
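A hedged usage sketch for the new `group_by` parameter (route path and query-parameter names are assumptions based on this function's signature and the proxy's conventions):

```python
# Sketch: pull the spend report grouped by customer instead of team.
import httpx

resp = httpx.get(
    "http://0.0.0.0:4000/global/spend/report",
    params={
        "start_date": "2024-06-01",  # assumed param name
        "end_date": "2024-06-30",    # assumed param name
        "group_by": "customer",      # new in this diff; defaults to "team"
    },
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.json())
```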
@ -7849,69 +7888,130 @@ async def get_global_spend_report(
f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
)
        if group_by == "team":
            # first get data from spend logs -> SpendByModelApiKey
            # then read data from "SpendByModelApiKey" to format the response obj
            sql_query = """
            WITH SpendByModelApiKey AS (
                SELECT
                    date_trunc('day', sl."startTime") AS group_by_day,
                    COALESCE(tt.team_alias, 'Unassigned Team') AS team_name,
                    sl.model,
                    sl.api_key,
                    SUM(sl.spend) AS model_api_spend,
                    SUM(sl.total_tokens) AS model_api_tokens
                FROM
                    "LiteLLM_SpendLogs" sl
                LEFT JOIN
                    "LiteLLM_TeamTable" tt
                ON
                    sl.team_id = tt.team_id
                WHERE
                    sl."startTime" BETWEEN $1::date AND $2::date
                GROUP BY
                    date_trunc('day', sl."startTime"),
                    tt.team_alias,
                    sl.model,
                    sl.api_key
            )
            SELECT
                group_by_day,
                jsonb_agg(jsonb_build_object(
                    'team_name', team_name,
                    'total_spend', total_spend,
                    'metadata', metadata
                )) AS teams
            FROM (
                SELECT
                    group_by_day,
                    team_name,
                    SUM(model_api_spend) AS total_spend,
                    jsonb_agg(jsonb_build_object(
                        'model', model,
                        'api_key', api_key,
                        'spend', model_api_spend,
                        'total_tokens', model_api_tokens
                    )) AS metadata
                FROM
                    SpendByModelApiKey
                GROUP BY
                    group_by_day,
                    team_name
            ) AS aggregated
            GROUP BY
                group_by_day
            ORDER BY
                group_by_day;
            """

            db_response = await prisma_client.db.query_raw(
                sql_query, start_date_obj, end_date_obj
            )
            if db_response is None:
                return []

            return db_response
        elif group_by == "customer":
            sql_query = """
            WITH SpendByModelApiKey AS (
                SELECT
                    date_trunc('day', sl."startTime") AS group_by_day,
                    sl.end_user AS customer,
                    sl.model,
                    sl.api_key,
                    SUM(sl.spend) AS model_api_spend,
                    SUM(sl.total_tokens) AS model_api_tokens
                FROM
                    "LiteLLM_SpendLogs" sl
                WHERE
                    sl."startTime" BETWEEN $1::date AND $2::date
                GROUP BY
                    date_trunc('day', sl."startTime"),
                    customer,
                    sl.model,
                    sl.api_key
            )
            SELECT
                group_by_day,
                jsonb_agg(jsonb_build_object(
                    'customer', customer,
                    'total_spend', total_spend,
                    'metadata', metadata
                )) AS customers
            FROM (
                SELECT
                    group_by_day,
                    customer,
                    SUM(model_api_spend) AS total_spend,
                    jsonb_agg(jsonb_build_object(
                        'model', model,
                        'api_key', api_key,
                        'spend', model_api_spend,
                        'total_tokens', model_api_tokens
                    )) AS metadata
                FROM
                    SpendByModelApiKey
                GROUP BY
                    group_by_day,
                    customer
            ) AS aggregated
            GROUP BY
                group_by_day
            ORDER BY
                group_by_day;
            """

            db_response = await prisma_client.db.query_raw(
                sql_query, start_date_obj, end_date_obj
            )
            if db_response is None:
                return []

            return db_response
except Exception as e:
raise HTTPException(
@ -8097,7 +8197,9 @@ async def _get_spend_report_for_time_range(
return response, spend_per_tag
except Exception as e:
verbose_proxy_logger.error("Exception in _get_daily_spend_reports", e) # noqa
verbose_proxy_logger.error(
"Exception in _get_daily_spend_reports {}".format(str(e))
) # noqa
@router.post(
@ -8755,7 +8857,7 @@ async def new_user(data: NewUserRequest):
- organization_id: Optional[str] - specify the org a user belongs to.
- user_email: Optional[str] - Specify a user email.
- send_invite_email: Optional[bool] - Specify if an invite email should be sent.
- user_role: Optional[str] - Specify a user role - "admin", "app_owner", "app_user"
- user_role: Optional[str] - Specify a user role - "proxy_admin", "proxy_admin_viewer", "internal_user", "internal_user_viewer", "team", "customer". Info about each role here: `https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L20`
- max_budget: Optional[float] - Specify max budget for a given user.
- models: Optional[list] - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- tpm_limit: Optional[int] - Specify tpm limit for a given user (Tokens per minute)
@ -8790,7 +8892,10 @@ async def new_user(data: NewUserRequest):
role="user",
user_email=data_json.get("user_email", None),
),
)
),
http_request=Request(
scope={"type": "http"},
),
)
if data.send_invite_email is True:
@ -9823,8 +9928,10 @@ async def delete_end_user(
dependencies=[Depends(user_api_key_auth)],
response_model=LiteLLM_TeamTable,
)
@management_endpoint_wrapper
async def new_team(
data: NewTeamRequest,
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
litellm_changed_by: Optional[str] = Header(
None,
@ -10058,6 +10165,7 @@ async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
@router.post(
"/team/update", tags=["team management"], dependencies=[Depends(user_api_key_auth)]
)
@management_endpoint_wrapper
async def update_team(
data: UpdateTeamRequest,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
@ -10163,8 +10271,10 @@ async def update_team(
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def team_member_add(
data: TeamMemberAddRequest,
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
@ -10190,10 +10300,12 @@ async def team_member_add(
raise HTTPException(status_code=400, detail={"error": "No team id passed in"})
if data.member is None:
raise HTTPException(status_code=400, detail={"error": "No member passed in"})
raise HTTPException(
status_code=400, detail={"error": "No member/members passed in"}
)
existing_team_row = await prisma_client.get_data( # type: ignore
team_id=data.team_id, table_name="team", query_type="find_unique"
existing_team_row = await prisma_client.db.litellm_teamtable.find_unique(
where={"team_id": data.team_id}
)
if existing_team_row is None:
raise HTTPException(
@ -10203,75 +10315,50 @@ async def team_member_add(
},
)
    complete_team_data = LiteLLM_TeamTable(**existing_team_row.model_dump())

    if isinstance(data.member, Member):
        # add to team db
        new_member = data.member

        complete_team_data.members_with_roles.append(new_member)

    elif isinstance(data.member, List):
        # add to team db
        new_members = data.member

        complete_team_data.members_with_roles.extend(new_members)

    # ADD MEMBER TO TEAM
    _db_team_members = [m.model_dump() for m in complete_team_data.members_with_roles]
    updated_team = await prisma_client.db.litellm_teamtable.update(
        where={"team_id": data.team_id},
        data={"members_with_roles": json.dumps(_db_team_members)},  # type: ignore
    )

    ## ADD USER, IF NEW ##
    if isinstance(data.member, Member):
        await add_new_member(
            new_member=data.member,
            max_budget_in_team=data.max_budget_in_team,
            prisma_client=prisma_client,
            user_api_key_dict=user_api_key_dict,
            litellm_proxy_admin_name=litellm_proxy_admin_name,
            team_id=data.team_id,
        )
    elif isinstance(data.member, List):
        tasks: List = []
        for m in data.member:
            await add_new_member(
                new_member=m,
                max_budget_in_team=data.max_budget_in_team,
                prisma_client=prisma_client,
                user_api_key_dict=user_api_key_dict,
                litellm_proxy_admin_name=litellm_proxy_admin_name,
                team_id=data.team_id,
            )
        await asyncio.gather(*tasks)

    return updated_team
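Illustratively, the request body this change now accepts — `member` may be a single object or a list (field names follow the `TeamMemberAddRequest` usage above; the role values are assumptions):

```python
# Sketch: /team/member_add now accepts a list of members plus an optional
# per-member budget.
payload = {
    "team_id": "my-team-id",
    "member": [
        {"role": "user", "user_id": "user-1"},            # role value assumed
        {"role": "user", "user_email": "dev@example.com"},
    ],
    "max_budget_in_team": 10.0,
}
```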
@router.post(
@ -10279,8 +10366,10 @@ async def team_member_add(
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def team_member_delete(
data: TeamMemberDeleteRequest,
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
@ -10384,8 +10473,10 @@ async def team_member_delete(
@router.post(
"/team/delete", tags=["team management"], dependencies=[Depends(user_api_key_auth)]
)
@management_endpoint_wrapper
async def delete_team(
data: DeleteTeamRequest,
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
litellm_changed_by: Optional[str] = Header(
None,
@ -10469,10 +10560,12 @@ async def delete_team(
@router.get(
"/team/info", tags=["team management"], dependencies=[Depends(user_api_key_auth)]
)
@management_endpoint_wrapper
async def team_info(
http_request: Request,
team_id: str = fastapi.Query(
default=None, description="Team ID in the request parameters"
)
),
):
"""
get info on team + related keys
@ -10556,8 +10649,10 @@ async def team_info(
@router.post(
"/team/block", tags=["team management"], dependencies=[Depends(user_api_key_auth)]
)
@management_endpoint_wrapper
async def block_team(
data: BlockTeamRequest,
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
@ -10578,8 +10673,10 @@ async def block_team(
@router.post(
"/team/unblock", tags=["team management"], dependencies=[Depends(user_api_key_auth)]
)
@management_endpoint_wrapper
async def unblock_team(
data: BlockTeamRequest,
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
@ -10600,7 +10697,9 @@ async def unblock_team(
@router.get(
"/team/list", tags=["team management"], dependencies=[Depends(user_api_key_auth)]
)
@management_endpoint_wrapper
async def list_team(
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
@ -13007,7 +13106,9 @@ async def auth_callback(request: Request):
user_role = getattr(result, generic_user_role_attribute_name, None)
if user_id is None:
user_id = getattr(result, "first_name", "") + getattr(result, "last_name", "")
_first_name = getattr(result, "first_name", "") or ""
_last_name = getattr(result, "last_name", "") or ""
user_id = _first_name + _last_name
user_info = None
user_id_models: List = []

View file

@ -91,7 +91,7 @@ model LiteLLM_TeamTable {
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
model_spend Json @default("{}")
model_max_budget Json @default("{}")
model_id Int? @unique
model_id Int? @unique // id for LiteLLM_ModelTable -> stores team-level model aliases
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
litellm_model_table LiteLLM_ModelTable? @relation(fields: [model_id], references: [id])
}

View file

@ -0,0 +1,41 @@
# mypy: ignore-errors
import openai
from opentelemetry import trace
from opentelemetry.context import Context
from opentelemetry.trace import SpanKind
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
trace.set_tracer_provider(TracerProvider())
memory_exporter = InMemorySpanExporter()
span_processor = SimpleSpanProcessor(memory_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)
tracer = trace.get_tracer(__name__)
# create an otel traceparent header
tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("ishaan-local-dev-app") as span:
span.set_attribute("generation_name", "ishaan-generation-openai-client")
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
extra_headers = {}
context = trace.set_span_in_context(span)
traceparent = TraceContextTextMapPropagator()
traceparent.inject(carrier=extra_headers, context=context)
print("EXTRA HEADERS: ", extra_headers)
_trace_parent = extra_headers.get("traceparent")
trace_id = _trace_parent.split("-")[1]
print("Trace ID: ", trace_id)
# # request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="llama3",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
extra_headers=extra_headers,
)
print(response)

View file

@ -0,0 +1,21 @@
# mypy: ignore-errors
import openai
import uuid
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
example_traceparent = f"00-80e1afed08e019fc1110464cfa66635c-02e80198930058d4-01"
extra_headers = {"traceparent": example_traceparent}
_trace_id = example_traceparent.split("-")[1]
print("EXTRA HEADERS: ", extra_headers)
print("Trace ID: ", _trace_id)
response = client.chat.completions.create(
model="llama3",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
extra_headers=extra_headers,
)
print(response)

View file

@ -48,6 +48,7 @@ from datetime import datetime, timedelta
from litellm.integrations.slack_alerting import SlackAlerting
from typing_extensions import overload
from functools import wraps
from fastapi import Request
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
@ -2017,6 +2018,7 @@ def get_logging_payload(
user_api_key_team_id=None,
user_api_key_user_id=None,
user_api_key_team_alias=None,
spend_logs_metadata=None,
)
if isinstance(metadata, dict):
verbose_proxy_logger.debug(
@ -2595,36 +2597,6 @@ async def update_spend(
raise e
async def _read_request_body(request):
"""
Asynchronous function to read the request body and parse it as JSON or literal data.
Parameters:
- request: The request object to read the body from
Returns:
- dict: Parsed request data as a dictionary
"""
import ast, json
try:
request_data = {}
if request is None:
return request_data
body = await request.body()
if body == b"" or body is None:
return request_data
body_str = body.decode()
try:
request_data = ast.literal_eval(body_str)
except:
request_data = json.loads(body_str)
return request_data
except:
return {}
def _is_projected_spend_over_limit(
current_spend: float, soft_budget_limit: Optional[float]
):

View file

@ -2057,11 +2057,14 @@ class Router:
generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
if isinstance(item, dict):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
elif isinstance(item, str):
fallback_model_group = [fallbacks.pop(idx)]
## if none, check for generic fallback
if (
fallback_model_group is None
@ -2310,13 +2313,15 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
fallback_model_group = None
generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
if isinstance(item, dict):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
elif isinstance(item, str):
fallback_model_group = [fallbacks.pop(idx)]
## if none, check for generic fallback
if (
fallback_model_group is None

View file

@ -810,6 +810,28 @@ def test_vertexai_embedding():
pytest.fail(f"Error occurred: {e}")
def test_vertexai_embedding_embedding_latest():
try:
load_vertex_ai_credentials()
litellm.set_verbose = True
response = embedding(
model="vertex_ai/text-embedding-004",
input=["hi"],
dimensions=1,
auto_truncate=True,
task_type="RETRIEVAL_QUERY",
)
assert len(response.data[0]["embedding"]) == 1
assert response.usage.prompt_tokens > 0
print(f"response:", response)
except litellm.RateLimitError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio
async def test_vertexai_aembedding():
try:

View file

@ -220,13 +220,13 @@ def test_completion_bedrock_claude_sts_oidc_auth():
aws_web_identity_token = "oidc/circleci_v2/"
aws_region_name = os.environ["AWS_REGION_NAME"]
# aws_role_name = os.environ["AWS_TEMP_ROLE_NAME"]
# TODO: This is using David's IAM role, we should use Litellm's IAM role eventually
# TODO: This is using ai.moda's IAM role, we should use LiteLLM's IAM role eventually
aws_role_name = "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
try:
litellm.set_verbose = True
response = completion(
response_1 = completion(
model="bedrock/anthropic.claude-3-haiku-20240307-v1:0",
messages=messages,
max_tokens=10,
@ -236,8 +236,40 @@ def test_completion_bedrock_claude_sts_oidc_auth():
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
)
# Add any assertions here to check the response
print(response)
print(response_1)
assert len(response_1.choices) > 0
assert len(response_1.choices[0].message.content) > 0
# This second call is to verify that the cache isn't breaking anything
response_2 = completion(
model="bedrock/anthropic.claude-3-haiku-20240307-v1:0",
messages=messages,
max_tokens=5,
temperature=0.2,
aws_region_name=aws_region_name,
aws_web_identity_token=aws_web_identity_token,
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
)
print(response_2)
assert len(response_2.choices) > 0
assert len(response_2.choices[0].message.content) > 0
# This third call is to verify that the cache isn't used for a different region
response_3 = completion(
model="bedrock/anthropic.claude-3-haiku-20240307-v1:0",
messages=messages,
max_tokens=6,
temperature=0.3,
aws_region_name="us-east-1",
aws_web_identity_token=aws_web_identity_token,
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
)
print(response_3)
assert len(response_3.choices) > 0
assert len(response_3.choices[0].message.content) > 0
except RateLimitError:
pass
except Exception as e:
@ -255,7 +287,7 @@ def test_completion_bedrock_httpx_command_r_sts_oidc_auth():
aws_web_identity_token = "oidc/circleci_v2/"
aws_region_name = os.environ["AWS_REGION_NAME"]
# aws_role_name = os.environ["AWS_TEMP_ROLE_NAME"]
# TODO: This is using David's IAM role, we should use Litellm's IAM role eventually
# TODO: This is using ai.moda's IAM role, we should use LiteLLM's IAM role eventually
aws_role_name = "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
try:

View file

@ -16,7 +16,7 @@ from litellm.llms.prompt_templates.factory import anthropic_messages_pt
from unittest.mock import patch, MagicMock
from litellm.llms.custom_httpx.http_handler import HTTPHandler, AsyncHTTPHandler
# litellm.num_retries=3
# litellm.num_retries = 3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
@ -114,6 +114,27 @@ def test_null_role_response():
assert response.choices[0].message.role == "assistant"
def test_completion_azure_ai_command_r():
try:
import os
litellm.set_verbose = True
os.environ["AZURE_AI_API_BASE"] = os.getenv("AZURE_COHERE_API_BASE", "")
os.environ["AZURE_AI_API_KEY"] = os.getenv("AZURE_COHERE_API_KEY", "")
response: litellm.ModelResponse = completion(
model="azure_ai/command-r-plus",
messages=[{"role": "user", "content": "What is the meaning of life?"}],
) # type: ignore
assert "azure_ai" in response.model
except litellm.Timeout as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_azure_command_r():
try:
litellm.set_verbose = True
@ -530,6 +551,7 @@ def test_completion_cohere_command_r_plus_function_call():
messages=messages,
tools=tools,
tool_choice="auto",
force_single_step=True,
)
print(second_response)
except Exception as e:
@ -720,7 +742,11 @@ def test_completion_claude_3_function_plus_image():
print(response)
def test_completion_azure_mistral_large_function_calling():
@pytest.mark.parametrize(
"provider",
["azure", "azure_ai"],
)
def test_completion_azure_mistral_large_function_calling(provider):
"""
This primarily tests whether the 'Function()' pydantic object correctly handles an arguments param passed in as a dict vs. a string
"""
@ -751,8 +777,9 @@ def test_completion_azure_mistral_large_function_calling():
"content": "What's the weather like in Boston today in Fahrenheit?",
}
]
response = completion(
model="azure/mistral-large-latest",
model="{}/mistral-large-latest".format(provider),
api_base=os.getenv("AZURE_MISTRAL_API_BASE"),
api_key=os.getenv("AZURE_MISTRAL_API_KEY"),
messages=messages,

View file

@ -34,14 +34,15 @@ class MyCustomHandler(CustomLogger):
self.response_cost = 0
def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call")
print("Pre-API Call")
traceback.print_stack()
self.data_sent_to_api = kwargs["additional_args"].get("complete_input_dict", {})
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(f"Post-API Call")
print("Post-API Call")
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
print("On Stream")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
@ -372,6 +373,7 @@ async def test_async_custom_handler_embedding_optional_param():
Tests if the openai optional params for embedding - user + encoding_format,
are logged
"""
litellm.set_verbose = True
customHandler_optional_params = MyCustomHandler()
litellm.callbacks = [customHandler_optional_params]
response = await litellm.aembedding(

View file

@ -55,8 +55,12 @@ async def test_content_policy_exception_azure():
except litellm.ContentPolicyViolationError as e:
print("caught a content policy violation error! Passed")
print("exception", e)
assert e.litellm_debug_info is not None
assert isinstance(e.litellm_debug_info, str)
assert len(e.litellm_debug_info) > 0
pass
except Exception as e:
print()
pytest.fail(f"An exception occurred - {str(e)}")

View file

@ -195,6 +195,8 @@ async def test_aimage_generation_vertex_ai():
assert isinstance(d, litellm.ImageObject)
print("data in response.data", d)
assert d.b64_json is not None
except litellm.ServiceUnavailableError as e:
pass
except litellm.RateLimitError as e:
pass
except litellm.ContentPolicyViolationError:

View file

@ -16,6 +16,7 @@ from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLMRoutes
from litellm.proxy.auth.handle_jwt import JWTHandler
from litellm.caching import DualCache
from datetime import datetime, timedelta
from fastapi import Request
public_key = {
"kty": "RSA",
@ -346,6 +347,7 @@ async def test_team_token_output(prisma_client, audience):
models=["gpt-3.5-turbo", "gpt-4"],
),
user_api_key_dict=result,
http_request=Request(scope={"type": "http"}),
)
except Exception as e:
pytest.fail(f"This should not fail - {str(e)}")
@ -534,6 +536,7 @@ async def test_user_token_output(
models=["gpt-3.5-turbo", "gpt-4"],
),
user_api_key_dict=result,
http_request=Request(scope={"type": "http"}),
)
if default_team_id:
await new_team(
@ -544,6 +547,7 @@ async def test_user_token_output(
models=["gpt-3.5-turbo", "gpt-4"],
),
user_api_key_dict=result,
http_request=Request(scope={"type": "http"}),
)
except Exception as e:
pytest.fail(f"This should not fail - {str(e)}")

View file

@ -137,6 +137,7 @@ async def test_new_user_response(prisma_client):
NewTeamRequest(
team_id=_team_id,
),
http_request=Request(scope={"type": "http"}),
user_api_key_dict=UserAPIKeyAuth(
user_role=LitellmUserRoles.PROXY_ADMIN,
api_key="sk-1234",
@ -272,7 +273,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e:
print("Got Exception", e)
print(e.message)
assert "Authentication Error, Invalid token passed" in e.message
assert "Authentication Error, Invalid proxy server token passed" in e.message
pass
@ -368,6 +369,7 @@ async def test_call_with_valid_model_using_all_models(prisma_client):
new_team_response = await new_team(
data=team_request,
user_api_key_dict=UserAPIKeyAuth(user_role=LitellmUserRoles.PROXY_ADMIN),
http_request=Request(scope={"type": "http"}),
)
print("new_team_response", new_team_response)
created_team_id = new_team_response["team_id"]
@ -471,7 +473,7 @@ def test_call_with_user_over_budget(prisma_client):
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail
assert "Budget has been exceeded" in error_detail
print(vars(e))
@ -652,7 +654,7 @@ def test_call_with_proxy_over_budget(prisma_client):
error_detail = e.message
else:
error_detail = traceback.format_exc()
assert "Authentication Error, ExceededBudget:" in error_detail
assert "Budget has been exceeded" in error_detail
print(vars(e))
@ -730,7 +732,7 @@ def test_call_with_user_over_budget_stream(prisma_client):
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail
assert "Budget has been exceeded" in error_detail
print(vars(e))
@ -827,7 +829,7 @@ def test_call_with_proxy_over_budget_stream(prisma_client):
asyncio.run(test())
except Exception as e:
error_detail = e.message
assert "Authentication Error, ExceededBudget:" in error_detail
assert "Budget has been exceeded" in error_detail
print(vars(e))
@ -1086,6 +1088,7 @@ def test_generate_and_update_key(prisma_client):
api_key="sk-1234",
user_id="1234",
),
http_request=Request(scope={"type": "http"}),
)
_team_2 = "ishaan-special-team_{}".format(uuid.uuid4())
@ -1098,6 +1101,7 @@ def test_generate_and_update_key(prisma_client):
api_key="sk-1234",
user_id="1234",
),
http_request=Request(scope={"type": "http"}),
)
request = NewUserRequest(
@ -1175,7 +1179,6 @@ def test_generate_and_update_key(prisma_client):
asyncio.run(test())
except Exception as e:
print("Got Exception", e)
print(e.message)
pytest.fail(f"An exception occurred - {str(e)}")
@ -1363,7 +1366,7 @@ def test_call_with_key_over_budget(prisma_client):
error_detail = e.message
else:
error_detail = str(e)
assert "Authentication Error, ExceededTokenBudget:" in error_detail
assert "Budget has been exceeded" in error_detail
print(vars(e))
@ -1477,7 +1480,7 @@ def test_call_with_key_over_model_budget(prisma_client):
# print(f"Error - {str(e)}")
traceback.print_exc()
error_detail = e.message
assert "Authentication Error, ExceededModelBudget:" in error_detail
assert "Budget has been exceeded!" in error_detail
print(vars(e))
@ -1638,7 +1641,7 @@ async def test_call_with_key_over_budget_stream(prisma_client):
except Exception as e:
print("Got Exception", e)
error_detail = e.message
assert "Authentication Error, ExceededTokenBudget:" in error_detail
assert "Budget has been exceeded" in error_detail
print(vars(e))
@ -2051,6 +2054,7 @@ async def test_master_key_hashing(prisma_client):
api_key="sk-1234",
user_id="1234",
),
http_request=Request(scope={"type": "http"}),
)
_response = await new_user(
@ -2184,6 +2188,7 @@ async def test_create_update_team(prisma_client):
tpm_limit=20,
rpm_limit=20,
),
http_request=Request(scope={"type": "http"}),
user_api_key_dict=UserAPIKeyAuth(
user_role=LitellmUserRoles.PROXY_ADMIN,
api_key="sk-1234",
@ -2233,7 +2238,10 @@ async def test_create_update_team(prisma_client):
)
# now hit team_info
response = await team_info(team_id=_team_id)
response = await team_info(
team_id=_team_id,
http_request=Request(scope={"type": "http"}),
)
print("RESPONSE from team_info", response)

View file

@ -1059,3 +1059,53 @@ async def test_default_model_fallbacks(sync_mode, litellm_module_fallbacks):
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_client_side_fallbacks_list(sync_mode):
"""
Tests Client Side Fallbacks
User can pass "fallbacks": ["gpt-3.5-turbo"] and this should work
"""
router = Router(
model_list=[
{
"model_name": "bad-model",
"litellm_params": {
"model": "openai/my-bad-model",
"api_key": "my-bad-api-key",
},
},
{
"model_name": "my-good-model",
"litellm_params": {
"model": "gpt-4o",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
],
)
if sync_mode:
response = router.completion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
else:
response = await router.acompletion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"

View file

@ -1463,6 +1463,10 @@ async def test_parallel_streaming_requests(sync_mode, model):
except RateLimitError:
pass
except litellm.InternalServerError as e:
if "predibase" in str(e).lower():
# only skip internal server error from predibase - their endpoint seems quite unstable
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -2535,7 +2539,10 @@ def streaming_and_function_calling_format_tests(idx, chunk):
return extracted_chunk, finished
def test_openai_streaming_and_function_calling():
@pytest.mark.parametrize(
"model", ["gpt-3.5-turbo", "anthropic.claude-3-sonnet-20240229-v1:0"]
)
def test_streaming_and_function_calling(model):
tools = [
{
"type": "function",
@ -2556,16 +2563,21 @@ def test_openai_streaming_and_function_calling():
},
}
]
messages = [{"role": "user", "content": "What is the weather like in Boston?"}]
try:
response = completion(
model="gpt-3.5-turbo",
litellm.set_verbose = True
response: litellm.CustomStreamWrapper = completion(
model=model,
tools=tools,
messages=messages,
stream=True,
)
tool_choice="required",
) # type: ignore
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
# continue
print("\n{}\n".format(chunk))
if idx == 0:
assert (
chunk.choices[0].delta.tool_calls[0].function.arguments is not None
@ -2573,6 +2585,7 @@ def test_openai_streaming_and_function_calling():
assert isinstance(
chunk.choices[0].delta.tool_calls[0].function.arguments, str
)
# assert False
except Exception as e:
pytest.fail(f"Error occurred: {e}")
raise e

View file

@ -3990,6 +3990,7 @@ def test_async_text_completion():
asyncio.run(test_get_response())
@pytest.mark.skip(reason="Tgai endpoints are unstable")
def test_async_text_completion_together_ai():
litellm.set_verbose = True
print("test_async_text_completion")

View file

@ -187,12 +187,43 @@ def test_load_test_token_counter(model):
print("model={}, total test time={}".format(model, total_time))
assert total_time < 10, f"Total encoding time > 10s, {total_time}"
def test_openai_token_with_image_and_text():
model = "gpt-4o"
full_request = {'model': 'gpt-4o', 'tools': [{'type': 'function', 'function': {'name': 'json', 'parameters': {'type': 'object', 'required': ['clause'], 'properties': {'clause': {'type': 'string'}}}, 'description': 'Respond with a JSON object.'}}], 'logprobs': False, 'messages': [{'role': 'user', 'content': [{'text': '\n Just some long text, long long text, and you know it will be longer than 7 tokens definetly.', 'type': 'text'}]}], 'tool_choice': {'type': 'function', 'function': {'name': 'json'}}, 'exclude_models': [], 'disable_fallback': False, 'exclude_providers': []}
full_request = {
"model": "gpt-4o",
"tools": [
{
"type": "function",
"function": {
"name": "json",
"parameters": {
"type": "object",
"required": ["clause"],
"properties": {"clause": {"type": "string"}},
},
"description": "Respond with a JSON object.",
},
}
],
"logprobs": False,
"messages": [
{
"role": "user",
"content": [
{
"text": "\n Just some long text, long long text, and you know it will be longer than 7 tokens definetly.",
"type": "text",
}
],
}
],
"tool_choice": {"type": "function", "function": {"name": "json"}},
"exclude_models": [],
"disable_fallback": False,
"exclude_providers": [],
}
messages = full_request.get("messages", [])
token_count = token_counter(model=model, messages=messages)
print(token_count)
test_openai_token_with_image_and_text()

View file

@ -23,6 +23,7 @@ from litellm.utils import (
create_pretrained_tokenizer,
create_tokenizer,
get_max_tokens,
get_supported_openai_params,
)
# Assuming your trim_messages, shorten_message_to_fit_limit, and get_token_count functions are all in a module named 'message_utils'
@ -386,3 +387,11 @@ def test_get_max_token_unit_test():
) # Returns a number instead of throwing an Exception
assert isinstance(max_tokens, int)
def test_get_supported_openai_params() -> None:
# Mapped provider
assert isinstance(get_supported_openai_params("gpt-4"), list)
# Unmapped provider
assert get_supported_openai_params("nonexistent") is None

View file

@ -1,5 +1,6 @@
from typing import TypedDict, Any, Union, Optional, Literal, List
import json
from .openai import ChatCompletionToolCallChunk
from typing_extensions import (
Self,
Protocol,
@ -118,6 +119,15 @@ class ToolBlockDeltaEvent(TypedDict):
input: str
class ToolUseBlockStartEvent(TypedDict):
name: str
toolUseId: str
class ContentBlockStartEvent(TypedDict, total=False):
toolUse: Optional[ToolUseBlockStartEvent]
class ContentBlockDeltaEvent(TypedDict, total=False):
"""
Either 'text' or 'toolUse' will be specified for Converse API streaming response.
@ -138,10 +148,11 @@ class RequestObject(TypedDict, total=False):
class GenericStreamingChunk(TypedDict):
text: Required[str]
tool_str: Required[str]
tool_use: Optional[ChatCompletionToolCallChunk]
is_finished: Required[bool]
finish_reason: Required[str]
usage: Optional[ConverseTokenUsageBlock]
index: int
class Document(TypedDict):

View file

@ -296,14 +296,27 @@ class ListBatchRequest(TypedDict, total=False):
class ChatCompletionToolCallFunctionChunk(TypedDict):
name: str
name: Optional[str]
arguments: str
class ChatCompletionToolCallChunk(TypedDict):
id: Optional[str]
type: Literal["function"]
function: ChatCompletionToolCallFunctionChunk
class ChatCompletionDeltaToolCallChunk(TypedDict):
id: str
type: Literal["function"]
function: ChatCompletionToolCallFunctionChunk
index: int
class ChatCompletionDeltaChunk(TypedDict, total=False):
content: Optional[str]
tool_calls: List[ChatCompletionDeltaToolCallChunk]
role: str
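An illustrative literal showing how the new delta-chunk types compose (values invented; import path assumed from the relative imports used elsewhere in this diff):

```python
# Sketch: a streaming tool-call delta shaped by the TypedDicts above.
from litellm.types.llms.openai import ChatCompletionDeltaChunk  # path assumed

delta: ChatCompletionDeltaChunk = {
    "role": "assistant",
    "tool_calls": [
        {
            "id": "call_abc123",
            "type": "function",
            "index": 0,
            "function": {"name": "get_weather", "arguments": '{"city": "Boston"}'},
        }
    ],
}
```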
class ChatCompletionResponseMessage(TypedDict, total=False):

View file

@ -30,7 +30,7 @@ from dataclasses import (
dataclass,
field,
)
import os
import litellm._service_logger # for storing API inputs, outputs, and metadata
from litellm.llms.custom_httpx.http_handler import HTTPHandler, AsyncHTTPHandler
from litellm.caching import DualCache
@ -49,9 +49,9 @@ except (ImportError, AttributeError):
filename = pkg_resources.resource_filename(__name__, "llms/tokenizers")
os.environ["TIKTOKEN_CACHE_DIR"] = (
filename # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
)
os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
"CUSTOM_TIKTOKEN_CACHE_DIR", filename
) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
encoding = tiktoken.get_encoding("cl100k_base")
from importlib import resources
@ -63,6 +63,11 @@ claude_json_str = json.dumps(json_data)
import importlib.metadata
from ._logging import verbose_logger
from .types.router import LiteLLM_Params
from .types.llms.openai import (
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionDeltaToolCallChunk,
)
from .integrations.traceloop import TraceloopLogger
from .integrations.athina import AthinaLogger
from .integrations.helicone import HeliconeLogger
@ -933,7 +938,6 @@ class TextCompletionResponse(OpenAIObject):
object=None,
**params,
):
if stream:
object = "text_completion.chunk"
choices = [TextChoices()]
@ -942,7 +946,6 @@ class TextCompletionResponse(OpenAIObject):
if choices is not None and isinstance(choices, list):
new_choices = []
for choice in choices:
if isinstance(choice, TextChoices):
_new_choice = choice
elif isinstance(choice, dict):
@ -1018,7 +1021,6 @@ class ImageObject(OpenAIObject):
revised_prompt: Optional[str] = None
def __init__(self, b64_json=None, url=None, revised_prompt=None):
super().__init__(b64_json=b64_json, url=url, revised_prompt=revised_prompt)
def __contains__(self, key):
@ -1342,28 +1344,29 @@ class Logging:
)
else:
verbose_logger.debug(f"\033[92m{curl_command}\033[0m\n")
        # log raw request to provider (like LangFuse) -- if opted in.
        if litellm.log_raw_request_response is True:
            try:
                # [Non-blocking Extra Debug Information in metadata]
                _litellm_params = self.model_call_details.get("litellm_params", {})
                _metadata = _litellm_params.get("metadata", {}) or {}
                if (
                    litellm.turn_off_message_logging is not None
                    and litellm.turn_off_message_logging is True
                ):
                    _metadata["raw_request"] = (
                        "redacted by litellm. \
                        'litellm.turn_off_message_logging=True'"
                    )
                else:
                    _metadata["raw_request"] = str(curl_command)
            except Exception as e:
                _metadata["raw_request"] = (
                    "Unable to Log \
                    raw request: {}".format(
                        str(e)
                    )
                )
if self.logger_fn and callable(self.logger_fn):
try:
self.logger_fn(
@ -1621,7 +1624,6 @@ class Logging:
end_time=end_time,
)
except Exception as e:
complete_streaming_response = None
else:
self.sync_streaming_chunks.append(result)
@ -2391,7 +2393,6 @@ class Logging:
"async_complete_streaming_response"
in self.model_call_details
):
await customLogger.async_log_event(
kwargs=self.model_call_details,
response_obj=self.model_call_details[
@ -2730,7 +2731,7 @@ class Logging:
only redacts when litellm.turn_off_message_logging == True
"""
# check if user opted out of logging message/response to callbacks
if litellm.turn_off_message_logging == True:
if litellm.turn_off_message_logging is True:
# remove messages, prompts, input, response from logging
self.model_call_details["messages"] = [
{"role": "user", "content": "redacted-by-litellm"}
@ -3250,7 +3251,7 @@ def client(original_function):
stream=kwargs.get("stream", False),
)
if kwargs.get("stream", False) == True:
if kwargs.get("stream", False) is True:
cached_result = CustomStreamWrapper(
completion_stream=cached_result,
model=model,
@ -4030,7 +4031,10 @@ def openai_token_counter(
"""
print_verbose(f"LiteLLM: Utils - Counting tokens for OpenAI model={model}")
try:
encoding = tiktoken.encoding_for_model(model)
if "gpt-4o" in model:
encoding = tiktoken.get_encoding("o200k_base")
else:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
print_verbose("Warning: model not found. Using cl100k_base encoding.")
encoding = tiktoken.get_encoding("cl100k_base")
@ -4894,6 +4898,18 @@ def get_optional_params_embeddings(
)
final_params = {**optional_params, **kwargs}
return final_params
if custom_llm_provider == "vertex_ai":
supported_params = get_supported_openai_params(
model=model,
custom_llm_provider="vertex_ai",
request_type="embeddings",
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.VertexAITextEmbeddingConfig().map_openai_params(
non_default_params=non_default_params, optional_params={}
)
final_params = {**optional_params, **kwargs}
return final_params
if custom_llm_provider == "vertex_ai":
if len(non_default_params.keys()) > 0:
if litellm.drop_params is True: # drop the unsupported non-default values
@ -4927,7 +4943,18 @@ def get_optional_params_embeddings(
message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
)
return {**non_default_params, **kwargs}
if custom_llm_provider == "mistral":
supported_params = get_supported_openai_params(
model=model,
custom_llm_provider="mistral",
request_type="embeddings",
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.MistralEmbeddingConfig().map_openai_params(
non_default_params=non_default_params, optional_params={}
)
final_params = {**optional_params, **kwargs}
return final_params
if (
custom_llm_provider != "openai"
and custom_llm_provider != "azure"
@ -6166,13 +6193,16 @@ def get_api_base(
if litellm.model_alias_map and model in litellm.model_alias_map:
model = litellm.model_alias_map[model]
try:
model, custom_llm_provider, dynamic_api_key, dynamic_api_base = (
get_llm_provider(
model=model,
custom_llm_provider=_optional_params.custom_llm_provider,
api_base=_optional_params.api_base,
api_key=_optional_params.api_key,
)
(
model,
custom_llm_provider,
dynamic_api_key,
dynamic_api_base,
) = get_llm_provider(
model=model,
custom_llm_provider=_optional_params.custom_llm_provider,
api_base=_optional_params.api_base,
api_key=_optional_params.api_key,
)
except Exception as e:
verbose_logger.debug("Error occurred in getting api base - {}".format(str(e)))
@ -6220,7 +6250,7 @@ def get_first_chars_messages(kwargs: dict) -> str:
def get_supported_openai_params(
model: str,
custom_llm_provider: str,
custom_llm_provider: Optional[str] = None,
request_type: Literal["chat_completion", "embeddings"] = "chat_completion",
) -> Optional[list]:
"""
@ -6235,6 +6265,11 @@ def get_supported_openai_params(
- List if custom_llm_provider is mapped
- None if unmapped
"""
if not custom_llm_provider:
try:
custom_llm_provider = litellm.get_llm_provider(model=model)[1]
except BadRequestError:
return None
if custom_llm_provider == "bedrock":
return litellm.AmazonConverseConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "ollama":
@ -6328,7 +6363,10 @@ def get_supported_openai_params(
"max_retries",
]
elif custom_llm_provider == "mistral":
return litellm.MistralConfig().get_supported_openai_params()
if request_type == "chat_completion":
return litellm.MistralConfig().get_supported_openai_params()
elif request_type == "embeddings":
return litellm.MistralEmbeddingConfig().get_supported_openai_params()
elif custom_llm_provider == "replicate":
return [
"stream",
@ -6370,7 +6408,10 @@ def get_supported_openai_params(
elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
elif custom_llm_provider == "vertex_ai":
return litellm.VertexAIConfig().get_supported_openai_params()
if request_type == "chat_completion":
return litellm.VertexAIConfig().get_supported_openai_params()
elif request_type == "embeddings":
return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
elif custom_llm_provider == "sagemaker":
return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
elif custom_llm_provider == "aleph_alpha":
@ -6577,6 +6618,9 @@ def get_llm_provider(
or get_secret("FIREWORKSAI_API_KEY")
or get_secret("FIREWORKS_AI_TOKEN")
)
elif custom_llm_provider == "azure_ai":
api_base = api_base or get_secret("AZURE_AI_API_BASE") # type: ignore
dynamic_api_key = api_key or get_secret("AZURE_AI_API_KEY")
elif custom_llm_provider == "mistral":
# mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai
api_base = (
@ -7458,7 +7502,6 @@ def validate_environment(model: Optional[str] = None) -> dict:
def set_callbacks(callback_list, function_id=None):
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try:
@ -8767,6 +8810,13 @@ def exception_type(
response=original_exception.response,
litellm_debug_info=extra_information,
)
if "Request failed during generation" in error_str:
# this is an internal server error from predibase
raise litellm.InternalServerError(
message=f"PredibaseException - {error_str}",
llm_provider="predibase",
model=model,
)
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
exception_mapping_worked = True
@ -9085,7 +9135,7 @@ def exception_type(
):
exception_mapping_worked = True
raise RateLimitError(
message=f"VertexAIException RateLimitError - {error_str}",
message=f"litellm.RateLimitError: VertexAIException - {error_str}",
model=model,
llm_provider="vertex_ai",
litellm_debug_info=extra_information,
@ -9097,7 +9147,14 @@ def exception_type(
),
),
)
elif "500 Internal Server Error" in error_str:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"litellm.ServiceUnavailableError: VertexAIException - {error_str}",
model=model,
llm_provider="vertex_ai",
litellm_debug_info=extra_information,
)
if hasattr(original_exception, "status_code"):
if original_exception.status_code == 400:
exception_mapping_worked = True
@ -10048,6 +10105,14 @@ def get_secret(
return oidc_token
else:
raise ValueError("Github OIDC provider failed")
elif oidc_provider == "azure":
# https://azure.github.io/azure-workload-identity/docs/quick-start.html
azure_federated_token_file = os.getenv("AZURE_FEDERATED_TOKEN_FILE")
if azure_federated_token_file is None:
raise ValueError("AZURE_FEDERATED_TOKEN_FILE not found in environment")
with open(azure_federated_token_file, "r") as f:
oidc_token = f.read()
return oidc_token
else:
raise ValueError("Unsupported OIDC provider")
@ -11311,7 +11376,6 @@ class CustomStreamWrapper:
raise StopIteration
response_obj: GenericStreamingChunk = chunk
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
@ -11326,6 +11390,10 @@ class CustomStreamWrapper:
completion_tokens=response_obj["usage"]["outputTokens"],
total_tokens=response_obj["usage"]["totalTokens"],
)
if "tool_use" in response_obj and response_obj["tool_use"] is not None:
completion_obj["tool_calls"] = [response_obj["tool_use"]]
elif self.custom_llm_provider == "sagemaker":
print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
response_obj = self.handle_sagemaker_stream(chunk)
@ -11342,7 +11410,6 @@ class CustomStreamWrapper:
new_chunk = self.completion_stream[:chunk_size]
completion_obj["content"] = new_chunk
self.completion_stream = self.completion_stream[chunk_size:]
time.sleep(0.05)
elif self.custom_llm_provider == "palm":
# fake streaming
response_obj = {}
@ -11355,7 +11422,6 @@ class CustomStreamWrapper:
new_chunk = self.completion_stream[:chunk_size]
completion_obj["content"] = new_chunk
self.completion_stream = self.completion_stream[chunk_size:]
time.sleep(0.05)
elif self.custom_llm_provider == "ollama":
response_obj = self.handle_ollama_stream(chunk)
completion_obj["content"] = response_obj["text"]
@ -11442,7 +11508,7 @@ class CustomStreamWrapper:
# for azure, we need to pass the model from the orignal chunk
self.model = chunk.model
response_obj = self.handle_openai_chat_completion_chunk(chunk)
if response_obj == None:
if response_obj is None:
return
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
@ -11575,7 +11641,7 @@ class CustomStreamWrapper:
else:
if (
self.stream_options is not None
and self.stream_options["include_usage"] == True
and self.stream_options["include_usage"] is True
):
return model_response
return
@ -11600,8 +11666,14 @@ class CustomStreamWrapper:
return model_response
elif (
"content" in completion_obj
and isinstance(completion_obj["content"], str)
and len(completion_obj["content"]) > 0
and (
isinstance(completion_obj["content"], str)
and len(completion_obj["content"]) > 0
)
or (
"tool_calls" in completion_obj
and len(completion_obj["tool_calls"]) > 0
)
): # cannot set content of an OpenAI Object to be an empty string
hold, model_response_str = self.check_special_tokens(
chunk=completion_obj["content"],
@ -11657,7 +11729,7 @@ class CustomStreamWrapper:
else:
## else
completion_obj["content"] = model_response_str
if self.sent_first_chunk == False:
if self.sent_first_chunk is False:
completion_obj["role"] = "assistant"
self.sent_first_chunk = True
model_response.choices[0].delta = Delta(**completion_obj)
@ -11666,7 +11738,7 @@ class CustomStreamWrapper:
else:
return
elif self.received_finish_reason is not None:
if self.sent_last_chunk == True:
if self.sent_last_chunk is True:
raise StopIteration
# flush any remaining holding chunk
if len(self.holding_chunk) > 0:

View file

@ -1387,6 +1387,26 @@
"mode": "image_generation",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
},
"text-embedding-004": {
"max_tokens": 3072,
"max_input_tokens": 3072,
"output_vector_size": 768,
"input_cost_per_token": 0.00000000625,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"text-multilingual-embedding-002": {
"max_tokens": 2048,
"max_input_tokens": 2048,
"output_vector_size": 768,
"input_cost_per_token": 0.00000000625,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-embedding-models",
"mode": "embedding",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models"
},
"textembedding-gecko": {
"max_tokens": 3072,
"max_input_tokens": 3072,

poetry.lock generated
View file

@ -1545,6 +1545,53 @@ files = [
{file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"},
]
[[package]]
name = "mypy"
version = "1.10.0"
description = "Optional static typing for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"},
{file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"},
{file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"},
{file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"},
{file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"},
{file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"},
{file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"},
{file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"},
{file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"},
{file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"},
{file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"},
{file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"},
{file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"},
{file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"},
{file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"},
{file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"},
{file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"},
{file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"},
{file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"},
{file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"},
{file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"},
{file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"},
{file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"},
{file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"},
{file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"},
{file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"},
{file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"},
]
[package.dependencies]
mypy-extensions = ">=1.0.0"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = ">=4.1.0"
[package.extras]
dmypy = ["psutil (>=4.0)"]
install-types = ["pip"]
mypyc = ["setuptools (>=50)"]
reports = ["lxml"]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
@ -2127,6 +2174,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@ -3150,4 +3198,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "cryptography", "fastapi", "fastapi-
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "6a37992b63b11d254f5f40687bd96898b1d9515728f663f30dcc81c4ef8df7b7"
content-hash = "73054c657782120d170dc168ef07b494a916f1f810ff9c2b0ac878bd857a9dac"

View file

@ -85,6 +85,9 @@ model_list:
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: mistral-embed
litellm_params:
model: mistral/mistral-embed
- model_name: gpt-instruct # [PROD TEST] - tests if `/health` automatically infers this to be a text completion model
litellm_params:
model: text-completion-openai/gpt-3.5-turbo-instruct
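With the `mistral-embed` entry above (and `MISTRAL_API_KEY` set in the proxy's environment), embedding requests route through the same OpenAI-compatible `/embeddings` endpoint. A sketch using the defaults seen elsewhere in this diff (`http://0.0.0.0:4000`, master key `sk-1234`):

```python
# Sketch: call the proxy's /embeddings route with the new mistral-embed alias.
# The base_url and api_key below assume the local proxy defaults.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.embeddings.create(
    model="mistral-embed",
    input=["hello world"],
)
print(len(response.data[0].embedding))  # vector length returned by Mistral
```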

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.40.8"
version = "1.40.9"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -19,7 +19,7 @@ documentation = "https://docs.litellm.ai"
python = ">=3.8.1,<4.0, !=3.9.7"
openai = ">=1.27.0"
python-dotenv = ">=0.2.0"
tiktoken = ">=0.4.0"
tiktoken = ">=0.7.0"
importlib-metadata = ">=6.8.0"
tokenizers = "*"
click = "*"
@ -76,6 +76,7 @@ litellm = 'litellm:run_server'
[tool.poetry.group.dev.dependencies]
flake8 = "^6.1.0"
black = "^23.12.0"
mypy = "^1.0"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"
@ -84,7 +85,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.40.8"
version = "1.40.9"
version_files = [
"pyproject.toml:^version"
]
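Two dependency changes here: `mypy` joins the dev group, and the `tiktoken` floor moves from 0.4.0 to 0.7.0. The bump matters because 0.7.0 is the first release shipping the `o200k_base` encoding used by `gpt-4o`, so usage-based token counting stays accurate for newer models. A quick sanity check, assuming stock `tiktoken` APIs:

```python
# Sketch: confirm tiktoken >= 0.7.0 resolves gpt-4o's encoding.
import tiktoken

enc = tiktoken.encoding_for_model("gpt-4o")      # o200k_base, new in 0.7.0
print(enc.name)                                  # "o200k_base"
print(len(enc.encode("hello who are you")))      # tokens counted for usage
```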

View file

@ -34,7 +34,7 @@ opentelemetry-exporter-otlp==1.25.0
### LITELLM PACKAGE DEPENDENCIES
python-dotenv==1.0.0 # for env
tiktoken==0.6.0 # for calculating usage
tiktoken==0.7.0 # for calculating usage
importlib-metadata==6.8.0 # for random utils
tokenizers==0.14.0 # for calculating usage
click==8.1.7 # for proxy cli

View file

@ -91,7 +91,7 @@ model LiteLLM_TeamTable {
updated_at DateTime @default(now()) @updatedAt @map("updated_at")
model_spend Json @default("{}")
model_max_budget Json @default("{}")
model_id Int? @unique
model_id Int? @unique // id for LiteLLM_ModelTable -> stores team-level model aliases
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
litellm_model_table LiteLLM_ModelTable? @relation(fields: [model_id], references: [id])
}
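The new comment pins down the relation: `model_id` points at `LiteLLM_ModelTable`, which stores team-level model aliases. A hedged sketch of following that relation with prisma-client-py (the lowercase accessor names are assumptions based on how the client is generated from this schema):

```python
# Sketch: resolve a team's model-alias row via the model_id relation.
# Accessor names (litellm_teamtable, litellm_model_table) are assumed
# from prisma-client-py's code generation for this schema.
from prisma import Prisma

async def get_team_model_aliases(team_id: str):
    db = Prisma()
    await db.connect()
    team = await db.litellm_teamtable.find_unique(
        where={"team_id": team_id},
        include={"litellm_model_table": True},  # joins LiteLLM_ModelTable
    )
    await db.disconnect()
    return team.litellm_model_table if team else None
```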

View file

@ -664,7 +664,7 @@ async def test_key_crossing_budget():
response = await chat_completion(session=session, key=key)
pytest.fail("Should have failed - Key crossed it's budget")
except Exception as e:
assert "ExceededTokenBudget: Current spend for token:" in str(e)
assert "Budget has been exceeded!" in str(e)
@pytest.mark.skip(reason="AWS Suspended Account")

View file

@ -22,6 +22,7 @@ async def generate_key(
"text-embedding-ada-002",
"dall-e-2",
"fake-openai-endpoint-2",
"mistral-embed",
],
):
url = "http://0.0.0.0:4000/key/generate"
@ -197,14 +198,14 @@ async def completion(session, key):
return response
async def embeddings(session, key):
async def embeddings(session, key, model="text-embedding-ada-002"):
url = "http://0.0.0.0:4000/embeddings"
headers = {
"Authorization": f"Bearer {key}",
"Content-Type": "application/json",
}
data = {
"model": "text-embedding-ada-002",
"model": model,
"input": ["hello world"],
}
@ -408,6 +409,9 @@ async def test_embeddings():
key_2 = key_gen["key"]
await embeddings(session=session, key=key_2)
# embedding request with non OpenAI model
await embeddings(session=session, key=key, model="mistral-embed")
@pytest.mark.asyncio
async def test_image_generation():

View file

@ -49,7 +49,7 @@ async def new_user(
async def add_member(
session, i, team_id, user_id=None, user_email=None, max_budget=None
session, i, team_id, user_id=None, user_email=None, max_budget=None, members=None
):
url = "http://0.0.0.0:4000/team/member_add"
headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}
@ -58,10 +58,13 @@ async def add_member(
data["member"]["user_email"] = user_email
elif user_id is not None:
data["member"]["user_id"] = user_id
elif members is not None:
data["member"] = members
if max_budget is not None:
data["max_budget_in_team"] = max_budget
print("sent data: {}".format(data))
async with session.post(url, headers=headers, json=data) as response:
status = response.status
response_text = await response.text()
@ -339,7 +342,7 @@ async def test_team_info():
async def test_team_update_sc_2():
"""
- Create team
- Add 1 user (doesn't exist in db)
- Add 10 users (don't exist in db)
- Change team alias
- Check if it works
- Assert team object unchanged besides team alias
@ -353,15 +356,20 @@ async def test_team_update_sc_2():
{"role": "admin", "user_id": admin_user},
]
team_data = await new_team(session=session, i=0, member_list=member_list)
## Create new normal user
new_normal_user = f"krrish_{uuid.uuid4()}@berri.ai"
## Create 10 normal users
members = [
{"role": "user", "user_id": f"krrish_{uuid.uuid4()}@berri.ai"}
for _ in range(10)
]
await add_member(
session=session,
i=0,
team_id=team_data["team_id"],
user_id=None,
user_email=new_normal_user,
session=session, i=0, team_id=team_data["team_id"], members=members
)
## ASSERT TEAM SIZE
team_info = await get_team_info(
session=session, get_team=team_data["team_id"], call_key="sk-1234"
)
assert len(team_info["team_info"]["members_with_roles"]) == 12
## CHANGE TEAM ALIAS
@ -570,4 +578,4 @@ async def test_users_in_team_budget():
except Exception as e:
print("got exception, this is expected")
print(e)
assert "Crossed spend within team" in str(e)
assert "Budget has been exceeded" in str(e)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[45980,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-6a03368053f9d26d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-83a8bdddfe32b5d9.js\",\"777\",\"static/chunks/777-17b0c91edd3a24fe.js\",\"931\",\"static/chunks/app/page-d61796ff0d3a8faf.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"tghLG7_IS7i5OkQJRvCIl\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d12f0c7c134d3e60.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[45980,[\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-6a03368053f9d26d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-83a8bdddfe32b5d9.js\",\"777\",\"static/chunks/777-17b0c91edd3a24fe.js\",\"931\",\"static/chunks/app/page-bd882aee817406ff.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f02cb03d96e276ef.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"48nWsJi-LJrUlOLzcK-Yz\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[45980,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-83a8bdddfe32b5d9.js","777","static/chunks/777-17b0c91edd3a24fe.js","931","static/chunks/app/page-d61796ff0d3a8faf.js"],""]
3:I[45980,["936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-83a8bdddfe32b5d9.js","777","static/chunks/777-17b0c91edd3a24fe.js","931","static/chunks/app/page-bd882aee817406ff.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tghLG7_IS7i5OkQJRvCIl",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["48nWsJi-LJrUlOLzcK-Yz",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -2,6 +2,6 @@
3:I[87494,["294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","777","static/chunks/777-17b0c91edd3a24fe.js","418","static/chunks/app/model_hub/page-4cb65c32467214b5.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tghLG7_IS7i5OkQJRvCIl",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["48nWsJi-LJrUlOLzcK-Yz",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -2,6 +2,6 @@
3:I[667,["665","static/chunks/3014691f-589a5f4865c3822f.js","294","static/chunks/294-0e35509d5ca95267.js","684","static/chunks/684-bb2d2f93d92acb0b.js","777","static/chunks/777-17b0c91edd3a24fe.js","461","static/chunks/app/onboarding/page-664c7288e11fff5a.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tghLG7_IS7i5OkQJRvCIl",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["48nWsJi-LJrUlOLzcK-Yz",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f02cb03d96e276ef.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -119,9 +119,24 @@ const ChatUI: React.FC<ChatUIProps> = ({
// Now, 'options' contains the list you wanted
console.log(options); // You can log it to verify the list
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available
setModelInfo(options);
// if options.length > 0, only store unique values
if (options.length > 0) {
const uniqueModels = Array.from(new Set(options));
console.log("Unique models:", uniqueModels);
// sort uniqueModels alphabetically
uniqueModels.sort((a: any, b: any) => a.label.localeCompare(b.label));
console.log("Model info:", modelInfo);
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available
setModelInfo(uniqueModels);
}
setSelectedModel(fetchedAvailableModels.data[0].id);
}
} catch (error) {

View file

@ -1130,7 +1130,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedAPIKey(key);
}}
>
{key["key_alias"]} (Enterpise only Feature)
{key["key_alias"]} (Enterprise only Feature)
</SelectItem>
);
}
@ -1165,7 +1165,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedCustomer(user);
}}
>
{user} (Enterpise only Feature)
{user} (Enterprise only Feature)
</SelectItem>
);
})

View file

@ -114,7 +114,7 @@ const Navbar: React.FC<NavbarProps> = ({
textDecoration: "underline",
}}
>
Get enterpise license
Get enterprise license
</a>
</div>
) : null}

View file

@ -832,7 +832,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
// @ts-ignore
disabled={true}
>
{tag} (Enterpise only Feature)
{tag} (Enterprise only Feature)
</SelectItem>
);
})}