Merge branch 'main' into litellm_aws_kms_fixes

This commit is contained in:
Krish Dholakia 2024-06-10 20:17:34 -07:00 committed by GitHub
commit 4475d2e5b5
34 changed files with 1293 additions and 483 deletions

View file

@ -150,7 +150,7 @@ $ litellm --config /path/to/config.yaml
```bash
curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-H "Authorization: Bearer sk-1234"
```
**Create a Thread**
@ -162,6 +162,14 @@ curl http://0.0.0.0:4000/v1/threads \
-d ''
```
**Get a Thread**
```bash
curl http://0.0.0.0:4000/v1/threads/{thread_id} \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
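The same call can be made from the OpenAI Python SDK pointed at the proxy. A minimal sketch, assuming the proxy is running locally on port 4000 with the `sk-1234` key from the examples above; the thread id is a placeholder:
```python
import openai

# Point the OpenAI client at the LiteLLM proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# Retrieve a thread by id (placeholder id shown)
thread = client.beta.threads.retrieve(thread_id="thread_abc123")
print(thread)
```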
**Add Messages to the Thread**
```bash

View file

@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t
</TabItem>
</Tabs>
## Switch Cache On / Off Per LiteLLM Call
LiteLLM supports 4 cache-controls:
- `no-cache`: *Optional(bool)* When `True`, will not return a cached response; instead, the actual endpoint is called.
- `no-store`: *Optional(bool)* When `True`, will not cache the response.
- `ttl`: *Optional(int)* Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within the user-defined age (in seconds).
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
<Tabs>
<TabItem value="no-cache" label="No-Cache">
Example usage `no-cache` - When `True`, will not return a cached response
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-cache": True},
)
```
</TabItem>
<TabItem value="no-store" label="No-Store">
Example usage `no-store` - When `True`, will not cache the response.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-store": True},
)
```
</TabItem>
<TabItem value="ttl" label="ttl">
Example usage `ttl` - cache the response for 10 seconds
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"ttl": 10},
)
```
</TabItem>
<TabItem value="s-maxage" label="s-maxage">
Example usage `s-maxage` - Will only accept cached responses that are at most 60 seconds old
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"s-maxage": 60},
)
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache

View file

@ -1,3 +1,5 @@
# llmcord.py
llmcord.py lets you and your friends chat with LLMs directly in your Discord server. It works with practically any LLM, remote or locally hosted.
GitHub: https://github.com/jakobdylanc/discord-llm-chatbot

View file

@ -138,14 +138,22 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
<Image img={require('../../img/admin_ui_spend.png')} />
## API Endpoints to get Spend
#### Getting Spend Reports - To Charge Other Teams, API Keys
#### Getting Spend Reports - To Charge Other Teams, Customers
Use the `/global/spend/report` endpoint to get daily spend per team, with a breakdown of spend per API Key, Model
Use the `/global/spend/report` endpoint to get a daily spend report per
- team
- customer [this is the `user` passed to the `/chat/completions` request](#how-to-track-spend-with-litellm)
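The same report can be pulled programmatically. A minimal Python sketch, assuming the proxy runs at `localhost:4000` and `sk-1234` is an admin key; the date range mirrors the curl examples below:
```python
import requests

resp = requests.get(
    "http://localhost:4000/global/spend/report",
    params={
        "start_date": "2024-04-01",
        "end_date": "2024-06-30",
        "group_by": "customer",  # matches the per-customer response shape shown below
    },
    headers={"Authorization": "Bearer sk-1234"},
)

# Iterate over daily spend per customer
for day in resp.json():
    for customer in day["customers"]:
        print(day["group_by_day"], customer["customer"], customer["total_spend"])
```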
<Tabs>
<TabItem value="per team" label="Spend Per Team">
##### Example Request
👉 Key Change: Specify `group_by=team`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=team' \
-H 'Authorization: Bearer sk-1234'
```
@ -254,6 +262,69 @@ Output from script
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="per customer" label="Spend Per Customer">
##### Example Request
👉 Key Change: Specify `group_by=customer`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=customer' \
-H 'Authorization: Bearer sk-1234'
```
##### Example Response
```shell
[
{
"group_by_day": "2024-04-30T00:00:00+00:00",
"customers": [
{
"customer": "palantir",
"total_spend": 0.0015265,
"metadata": [ # see the spend by unique(key + model)
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "88dc28.." # the hashed api key
},
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "a73dc2.." # the hashed api key
},
{
"model": "chatgpt-v-2",
"spend": 0.000214,
"total_tokens": 122,
"api_key": "898c28.." # the hashed api key
},
{
"model": "gpt-3.5-turbo",
"spend": 0.0000825,
"total_tokens": 85,
"api_key": "84dc28.." # the hashed api key
}
]
}
]
}
]
```
</TabItem>
</Tabs>

View file

@ -42,6 +42,14 @@ Set `JSON_LOGS="True"` in your env:
```bash
export JSON_LOGS="True"
```
**OR**
Set `json_logs: true` in your yaml:
```yaml
litellm_settings:
json_logs: true
```
Start proxy
@ -50,3 +58,34 @@ $ litellm
```
The proxy will now output all logs in JSON format.
## Control Log Output
Turn off FastAPI's default 'INFO' logs
1. Turn on 'json logs'
```yaml
litellm_settings:
json_logs: true
```
2. Set `LITELLM_LOG` to 'ERROR'
Only get logs if an error occurs.
```bash
LITELLM_LOG="ERROR"
```
3. Start proxy
```bash
$ litellm
```
Expected Output:
```bash
# no info statements
```

View file

@ -2,11 +2,21 @@
Call management endpoints on behalf of a user. (Useful when connecting the proxy to your development platform.)
:::info
Requires Enterprise License for usage.
:::tip
Requires an Enterprise License. Get in touch with us [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Set `LiteLLM-Changed-By` in request headers
## 1. Switch on audit Logs
Add `store_audit_logs` to your litellm config.yaml and then start the proxy
```shell
litellm_settings:
store_audit_logs: true
```
## 2. Set `LiteLLM-Changed-By` in request headers
Set the 'user_id' in the request headers when calling a management endpoint. [View Full List](https://litellm-api.up.railway.app/#/team%20management).
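A minimal Python sketch of the same pattern, assuming the proxy runs locally with admin key `sk-1234`; the team id, changed-by user id, and update payload are placeholders:
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/team/update",
    headers={
        "Authorization": "Bearer sk-1234",
        "LiteLLM-Changed-By": "changed-by-user@example.com",  # 'user_id' of the user making the change
        "Content-Type": "application/json",
    },
    json={"team_id": "my-team-id", "max_budget": 50},  # placeholder update
)
print(resp.json())
```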
@ -26,7 +36,7 @@ curl -X POST 'http://0.0.0.0:4000/team/update' \
}'
```
## Emitted Audit Log
## 3. Emitted Audit Log
```bash
{

View file

@ -21,6 +21,7 @@ general_settings:
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```
Set slack webhook url in your env
@ -28,6 +29,11 @@ Set slack webhook url in your env
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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"
```
Turn off FastAPI's default INFO logs
```bash
export LITELLM_LOG="ERROR"
```
:::info
Need help or want dedicated support? Talk to a founder [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).

View file

@ -2,18 +2,13 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔥 Fallbacks, Retries, Timeouts, Load Balancing
# 🔥 Load Balancing, Fallbacks, Retries, Timeouts
Retry calls with multiple instances of the same model.
If a call fails after num_retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if given).
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
- Quick Start [load balancing](#test---load-balancing)
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
## Quick Start - Load Balancing
### Step 1 - Set deployments on config
#### Step 1 - Set deployments on config
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
```yaml
@ -38,14 +33,49 @@ model_list:
rpm: 1440
```
### Step 2: Start Proxy with config
#### Step 2: Start Proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
### Step 3: Use proxy - Call a model group [Load Balancing]
Curl Command
### Test - Load Balancing
Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
👉 Key Change: `model="gpt-3.5-turbo"`
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
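One way to inspect those headers from Python is the SDK's raw-response interface. A minimal sketch; the `x-litellm-model-id` header name is an assumption, check the response headers returned by your proxy for the exact key:
```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)

print(raw.headers.get("x-litellm-model-id"))  # which deployment served this call (assumed header name)
print(raw.parse())  # the usual ChatCompletion object
```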
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
]
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -56,32 +86,167 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
"role": "user",
"content": "what llm are you"
}
],
}
'
]
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="gpt-3.5-turbo",
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
### Usage - Call a specific model deployment
If you want to call a specific model defined in the `config.yaml`, you can call the `litellm_params: model`
</TabItem>
In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
</Tabs>
```bash
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail
2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
3. The request to `model="gpt-3.5-turbo"` will succeed and the client making the request will get a response from gpt-3.5-turbo
👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="zephyr-beta",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "azure/gpt-turbo-small-ca",
"model": "zephyr-beta"",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
'
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="zephyr-beta",
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
## Fallbacks + Retries + Timeouts + Cooldowns
</TabItem>
</Tabs>
<!--
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
``` -->
## Advanced
### Fallbacks + Retries + Timeouts + Cooldowns
**Set via config**
```yaml
@ -114,44 +279,7 @@ litellm_settings:
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
```
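The same settings can also be applied on the Python `Router` directly. A minimal sketch, assuming `model_list` holds the deployments from the config above:
```python
from litellm import Router

router = Router(
    model_list=model_list,  # deployments defined as in the config above
    num_retries=2,
    fallbacks=[{"zephyr-beta": ["gpt-3.5-turbo"]}],
    context_window_fallbacks=[
        {"zephyr-beta": ["gpt-3.5-turbo-16k"]},
        {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]},
    ],
    allowed_fails=3,  # cooldown a deployment after it exceeds the allowed failures in a minute
)
```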
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"num_retries": 2,
"timeout": 10
}
'
```
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
```
## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)
### Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
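The same check can be switched on when using the Python `Router` directly. A minimal sketch, assuming `model_list` holds your deployments:
```python
from litellm import Router

# Before routing, filter out deployments whose context window is too small for the request
router = Router(model_list=model_list, enable_pre_call_checks=True)
```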
@ -287,7 +415,7 @@ print(response)
</Tabs>
## Advanced - EU-Region Filtering (Pre-Call Checks)
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if the request can be routed to an EU-region deployment with **`enable_pre_call_checks: true`**.
@ -350,7 +478,7 @@ print(response)
print(f"response.headers.get('x-litellm-model-api-base')")
```
## Advanced - Custom Timeouts, Stream Timeouts - Per Model
### Custom Timeouts, Stream Timeouts - Per Model
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
```yaml
model_list:
@ -379,7 +507,7 @@ $ litellm --config /path/to/config.yaml
```
## Advanced - Setting Dynamic Timeouts - Per Request
### Setting Dynamic Timeouts - Per Request
LiteLLM Proxy supports setting a `timeout` per request
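A minimal sketch of sending a per-request timeout through the OpenAI SDK; forwarding it via `extra_body` is an assumption here, mirror whatever request-body field your proxy version expects:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
    extra_body={"timeout": 1},  # assumed request-body field: give this call 1 second
)
print(response)
```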

View file

@ -255,6 +255,7 @@ const sidebars = {
"projects/GPT Migrate",
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
],
},
],

View file

@ -709,6 +709,7 @@ all_embedding_models = (
openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .cost_calculator import completion_cost
from .utils import (
client,
exception_type,
@ -718,7 +719,6 @@ from .utils import (
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
completion_cost,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,

View file

@ -1,6 +1,7 @@
# What is this?
## File for 'response_cost' calculation in Logging
from typing import Optional, Union, Literal
from typing import Optional, Union, Literal, List
import litellm._logging
from litellm.utils import (
ModelResponse,
EmbeddingResponse,
@ -8,10 +9,281 @@ from litellm.utils import (
TranscriptionResponse,
TextCompletionResponse,
CallTypes,
completion_cost,
cost_per_token,
print_verbose,
CostPerToken,
token_counter,
)
import litellm
from litellm import verbose_logger
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
import re
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
def completion_cost(
completion_response=None,
model: Optional[str] = None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
def response_cost_calculator(
@ -47,7 +319,7 @@ def response_cost_calculator(
) -> Optional[float]:
try:
response_cost: float = 0.0
if cache_hit is not None and cache_hit == True:
if cache_hit is not None and cache_hit is True:
response_cost = 0.0
else:
response_object._hidden_params["optional_params"] = optional_params
@ -62,9 +334,11 @@ def response_cost_calculator(
if (
model in litellm.model_cost
and custom_pricing is not None
and custom_llm_provider == True
and custom_llm_provider is True
): # override defaults if custom pricing is set
base_model = model
elif base_model is None:
base_model = model
# base_model defaults to None if not set on model_info
response_cost = completion_cost(
completion_response=response_object,

View file

@ -20,7 +20,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -32,8 +32,14 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -60,7 +66,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
message,
model,
llm_provider,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -72,8 +78,14 @@ class NotFoundError(openai.NotFoundError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -262,7 +274,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -274,8 +286,18 @@ class RateLimitError(openai.RateLimitError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -421,7 +443,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -433,8 +455,18 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -460,7 +492,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
message,
llm_provider,
model,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -472,8 +504,18 @@ class InternalServerError(openai.InternalServerError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=response, body=None
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):

View file

@ -366,8 +366,6 @@ class OpenTelemetry(CustomLogger):
)
message = choice.get("message")
if not isinstance(message, dict):
message = message.dict()
tool_calls = message.get("tool_calls")
if tool_calls:
span.set_attribute(

View file

@ -3,6 +3,7 @@
from functools import partial
import os, types
import traceback
import json
from enum import Enum
import requests, copy # type: ignore
@ -242,12 +243,12 @@ class PredibaseChatCompletion(BaseLLM):
"details" in completion_response
and "tokens" in completion_response["details"]
):
model_response.choices[0].finish_reason = completion_response[
"details"
]["finish_reason"]
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["details"]["finish_reason"]
)
sum_logprob = 0
for token in completion_response["details"]["tokens"]:
if token["logprob"] != None:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
model_response["choices"][0][
"message"
@ -265,7 +266,7 @@ class PredibaseChatCompletion(BaseLLM):
):
sum_logprob = 0
for token in item["tokens"]:
if token["logprob"] != None:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
if len(item["generated_text"]) > 0:
message_obj = Message(
@ -275,7 +276,7 @@ class PredibaseChatCompletion(BaseLLM):
else:
message_obj = Message(content=None)
choice_obj = Choices(
finish_reason=item["finish_reason"],
finish_reason=map_finish_reason(item["finish_reason"]),
index=idx + 1,
message=message_obj,
)
@ -285,10 +286,8 @@ class PredibaseChatCompletion(BaseLLM):
## CALCULATING USAGE
prompt_tokens = 0
try:
prompt_tokens = len(
encoding.encode(model_response["choices"][0]["message"]["content"])
) ##[TODO] use a model-specific tokenizer here
except:
prompt_tokens = litellm.token_counter(messages=messages)
except Exception:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
output_text = model_response["choices"][0]["message"].get("content", "")
@ -331,6 +330,7 @@ class PredibaseChatCompletion(BaseLLM):
logging_obj,
optional_params: dict,
tenant_id: str,
timeout: Union[float, httpx.Timeout],
acompletion=None,
litellm_params=None,
logger_fn=None,
@ -340,6 +340,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = ""
input_text = ""
base_url = "https://serving.app.predibase.com"
if "https" in model:
completion_url = model
elif api_base:
@ -349,7 +350,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = f"{base_url}/{tenant_id}/deployments/v2/llms/{model}"
if optional_params.get("stream", False) == True:
if optional_params.get("stream", False) is True:
completion_url += "/generate_stream"
else:
completion_url += "/generate"
@ -393,9 +394,9 @@ class PredibaseChatCompletion(BaseLLM):
},
)
## COMPLETION CALL
if acompletion == True:
if acompletion is True:
### ASYNC STREAMING
if stream == True:
if stream is True:
return self.async_streaming(
model=model,
messages=messages,
@ -410,6 +411,7 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
else:
### ASYNC COMPLETION
@ -428,10 +430,11 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
### SYNC STREAMING
if stream == True:
if stream is True:
response = requests.post(
completion_url,
headers=headers,
@ -452,7 +455,6 @@ class PredibaseChatCompletion(BaseLLM):
headers=headers,
data=json.dumps(data),
)
return self.process_response(
model=model,
response=response,
@ -480,23 +482,26 @@ class PredibaseChatCompletion(BaseLLM):
stream,
data: dict,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
litellm_params=None,
logger_fn=None,
headers={},
) -> ModelResponse:
self.async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
try:
response = await self.async_handler.post(
response = await async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
except httpx.HTTPStatusError as e:
raise PredibaseError(
status_code=e.response.status_code, message=e.response.text
status_code=e.response.status_code,
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
raise PredibaseError(status_code=500, message=str(e))
raise PredibaseError(
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
)
return self.process_response(
model=model,
response=response,
@ -522,6 +527,7 @@ class PredibaseChatCompletion(BaseLLM):
api_key,
logging_obj,
data: dict,
timeout: Union[float, httpx.Timeout],
optional_params=None,
litellm_params=None,
logger_fn=None,

View file

@ -432,9 +432,9 @@ def mock_completion(
if isinstance(mock_response, openai.APIError):
raise mock_response
raise litellm.APIError(
status_code=500, # type: ignore
message=str(mock_response),
llm_provider="openai", # type: ignore
status_code=getattr(mock_response, "status_code", 500), # type: ignore
message=getattr(mock_response, "text", str(mock_response)),
llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
@ -1949,7 +1949,8 @@ def completion(
)
api_base = (
optional_params.pop("api_base", None)
api_base
or optional_params.pop("api_base", None)
or optional_params.pop("base_url", None)
or litellm.api_base
or get_secret("PREDIBASE_API_BASE")
@ -1977,12 +1978,13 @@ def completion(
custom_prompt_dict=custom_prompt_dict,
api_key=api_key,
tenant_id=tenant_id,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
and optional_params["stream"] is True
and acompletion is False
):
return _model_response
response = _model_response

View file

@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker",
"mode": "chat"
},
"together-ai-up-to-3b": {
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
},
"together-ai-3.1b-7b": {
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
},
"together-ai-7.1b-20b": {
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000004,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
},
"together-ai-20.1b-40b": {
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
},
"together-ai-40.1b-70b": {
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,

View file

@ -1,7 +1,12 @@
import json
import logging
from logging import Formatter
import sys
import os
from litellm import json_logs
# Set default log level to INFO
log_level = os.getenv("LITELLM_LOG", "INFO")
numeric_level: int = getattr(logging, log_level.upper())
class JsonFormatter(Formatter):
@ -16,6 +21,14 @@ class JsonFormatter(Formatter):
logger = logging.root
handler = logging.StreamHandler()
if json_logs:
handler.setFormatter(JsonFormatter())
else:
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
datefmt="%H:%M:%S",
)
handler.setFormatter(formatter)
logger.handlers = [handler]
logger.setLevel(logging.INFO)
logger.setLevel(numeric_level)

View file

@ -8,6 +8,17 @@ model_list:
- model_name: llama3-70b-8192
litellm_params:
model: groq/llama3-70b-8192
- model_name: fake-openai-endpoint
litellm_params:
model: predibase/llama-3-8b-instruct
api_base: "http://0.0.0.0:8081"
api_key: os.environ/PREDIBASE_API_KEY
tenant_id: os.environ/PREDIBASE_TENANT_ID
max_retries: 0
temperature: 0.1
max_new_tokens: 256
return_full_text: false
# - litellm_params:
# api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
# api_key: os.environ/AZURE_EUROPE_API_KEY
@ -57,6 +68,8 @@ router_settings:
litellm_settings:
success_callback: ["langfuse"]
cache: True
failure_callback: ["langfuse"]
general_settings:
alerting: ["email"]

View file

@ -160,6 +160,7 @@ from litellm.proxy.auth.auth_checks import (
get_user_object,
allowed_routes_check,
get_actual_routes,
log_to_opentelemetry,
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
@ -368,6 +369,11 @@ from typing import Dict
api_key_header = APIKeyHeader(
name="Authorization", auto_error=False, description="Bearer token"
)
azure_api_key_header = APIKeyHeader(
name="API-Key",
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
user_api_base = None
user_model = None
user_debug = False
@ -508,13 +514,19 @@ async def check_request_disconnection(request: Request, llm_api_call_task):
async def user_api_key_auth(
request: Request, api_key: str = fastapi.Security(api_key_header)
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
) -> UserAPIKeyAuth:
global master_key, prisma_client, llm_model_list, user_custom_auth, custom_db_client, general_settings, proxy_logging_obj
try:
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
@ -1495,7 +1507,7 @@ async def user_api_key_auth(
)
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid token passed")
raise Exception("Invalid proxy server token passed")
if valid_token_dict is not None:
if user_id_information is not None and _is_user_proxy_admin(
user_id_information
@ -1528,6 +1540,14 @@ async def user_api_key_auth(
str(e)
)
)
# Log this exception to OTEL
if open_telemetry_logger is not None:
await open_telemetry_logger.async_post_call_failure_hook(
original_exception=e,
user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, litellm.BudgetExceededError):
raise ProxyException(
@ -7803,6 +7823,10 @@ async def get_global_spend_report(
default=None,
description="Time till which to view spend",
),
group_by: Optional[Literal["team", "customer"]] = fastapi.Query(
default="team",
description="Group spend by internal team or customer",
),
):
"""
Get Daily Spend per Team, based on specific startTime and endTime. Per team, view usage by each key, model
@ -7849,6 +7873,7 @@ async def get_global_spend_report(
f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
)
if group_by == "team":
# first get data from spend logs -> SpendByModelApiKey
# then read data from "SpendByModelApiKey" to format the response obj
sql_query = """
@ -7913,6 +7938,66 @@ async def get_global_spend_report(
return db_response
elif group_by == "customer":
sql_query = """
WITH SpendByModelApiKey AS (
SELECT
date_trunc('day', sl."startTime") AS group_by_day,
sl.end_user AS customer,
sl.model,
sl.api_key,
SUM(sl.spend) AS model_api_spend,
SUM(sl.total_tokens) AS model_api_tokens
FROM
"LiteLLM_SpendLogs" sl
WHERE
sl."startTime" BETWEEN $1::date AND $2::date
GROUP BY
date_trunc('day', sl."startTime"),
customer,
sl.model,
sl.api_key
)
SELECT
group_by_day,
jsonb_agg(jsonb_build_object(
'customer', customer,
'total_spend', total_spend,
'metadata', metadata
)) AS customers
FROM
(
SELECT
group_by_day,
customer,
SUM(model_api_spend) AS total_spend,
jsonb_agg(jsonb_build_object(
'model', model,
'api_key', api_key,
'spend', model_api_spend,
'total_tokens', model_api_tokens
)) AS metadata
FROM
SpendByModelApiKey
GROUP BY
group_by_day,
customer
) AS aggregated
GROUP BY
group_by_day
ORDER BY
group_by_day;
"""
db_response = await prisma_client.db.query_raw(
sql_query, start_date_obj, end_date_obj
)
if db_response is None:
return []
return db_response
except Exception as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,

View file

@ -2056,6 +2056,9 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks
if isinstance(fallbacks, list):
fallback_model_group = fallbacks
elif isinstance(fallbacks, dict):
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
@ -2310,6 +2313,9 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
fallback_model_group = None
generic_fallback_idx: Optional[int] = None
if isinstance(fallbacks, list):
fallback_model_group = fallbacks
elif isinstance(fallbacks, dict):
## check for specific model group-specific fallbacks
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:

View file

@ -345,7 +345,7 @@ def test_completion_claude_3_function_call(model):
drop_params=True,
)
# Add any assertions, here to check response args
# Add any assertions here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
@ -530,6 +530,7 @@ def test_completion_cohere_command_r_plus_function_call():
messages=messages,
tools=tools,
tool_choice="auto",
force_single_step=True,
)
print(second_response)
except Exception as e:

View file

@ -517,3 +517,51 @@ def test_groq_response_cost_tracking(is_streaming):
assert response_cost > 0.0
print(f"response_cost: {response_cost}")
def test_together_ai_qwen_completion_cost():
input_kwargs = {
"completion_response": litellm.ModelResponse(
**{
"id": "890db0c33c4ef94b-SJC",
"choices": [
{
"finish_reason": "eos",
"index": 0,
"message": {
"content": "I am Qwen, a large language model created by Alibaba Cloud.",
"role": "assistant",
},
}
],
"created": 1717900130,
"model": "together_ai/qwen/Qwen2-72B-Instruct",
"object": "chat.completion",
"system_fingerprint": None,
"usage": {
"completion_tokens": 15,
"prompt_tokens": 23,
"total_tokens": 38,
},
}
),
"model": "qwen/Qwen2-72B-Instruct",
"prompt": "",
"messages": [],
"completion": "",
"total_time": 0.0,
"call_type": "completion",
"custom_llm_provider": "together_ai",
"region_name": None,
"size": None,
"quality": None,
"n": None,
"custom_cost_per_token": None,
"custom_cost_per_second": None,
}
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
)
assert response == "together-ai-41.1b-80b"

View file

@ -3,6 +3,7 @@ import os
import sys
import traceback
import subprocess, asyncio
from typing import Any
sys.path.insert(
0, os.path.abspath("../..")
@ -19,6 +20,7 @@ from litellm import (
)
from concurrent.futures import ThreadPoolExecutor
import pytest
from unittest.mock import patch, MagicMock
litellm.vertex_project = "pathrise-convert-1606954137718"
litellm.vertex_location = "us-central1"
@ -655,3 +657,47 @@ def test_litellm_predibase_exception():
# accuracy_score = counts[True]/(counts[True] + counts[False])
# print(f"accuracy_score: {accuracy_score}")
@pytest.mark.parametrize("provider", ["predibase"])
def test_exception_mapping(provider):
"""
For predibase, run through a set of mock exceptions
assert that they are being mapped correctly
"""
litellm.set_verbose = True
error_map = {
400: litellm.BadRequestError,
401: litellm.AuthenticationError,
404: litellm.NotFoundError,
408: litellm.Timeout,
429: litellm.RateLimitError,
500: litellm.InternalServerError,
503: litellm.ServiceUnavailableError,
}
for code, expected_exception in error_map.items():
mock_response = Exception()
setattr(mock_response, "text", "This is an error message")
setattr(mock_response, "llm_provider", provider)
setattr(mock_response, "status_code", code)
response: Any = None
try:
response = completion(
model="{}/test-model".format(provider),
messages=[{"role": "user", "content": "Hey, how's it going?"}],
mock_response=mock_response,
)
except expected_exception:
continue
except Exception as e:
response = "{}\n{}".format(str(e), traceback.format_exc())
pytest.fail(
"Did not raise expected exception. Expected={}, Return={},".format(
expected_exception, response
)
)
pass

View file

@ -272,7 +272,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e:
print("Got Exception", e)
print(e.message)
assert "Authentication Error, Invalid token passed" in e.message
assert "Authentication Error, Invalid proxy server token passed" in e.message
pass

View file

@ -1059,3 +1059,53 @@ async def test_default_model_fallbacks(sync_mode, litellm_module_fallbacks):
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_client_side_fallbacks_list(sync_mode):
"""
Tests Client Side Fallbacks
User can pass "fallbacks": ["gpt-3.5-turbo"] and this should work
"""
router = Router(
model_list=[
{
"model_name": "bad-model",
"litellm_params": {
"model": "openai/my-bad-model",
"api_key": "my-bad-api-key",
},
},
{
"model_name": "my-good-model",
"litellm_params": {
"model": "gpt-4o",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
],
)
if sync_mode:
response = router.completion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
else:
response = await router.acompletion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"

View file

@ -326,6 +326,22 @@ class Function(OpenAIObject):
super(Function, self).__init__(**data)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class ChatCompletionDeltaToolCall(OpenAIObject):
id: Optional[str] = None
@ -385,6 +401,22 @@ class ChatCompletionMessageToolCall(OpenAIObject):
else:
self.type = "function"
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class Message(OpenAIObject):
def __init__(
@ -3929,54 +3961,6 @@ def client(original_function):
return wrapper
####### USAGE CALCULATOR ################
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name):
import re
model_name = model_name.lower()
params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if params_match != None:
params_match = params_match.group(1)
params_match = params_match.replace("b", "")
params_billion = float(params_match)
# Determine the category based on the number of parameters
if params_billion <= 3.0:
category = "together-ai-up-to-3b"
elif params_billion <= 7.0:
category = "together-ai-3.1b-7b"
elif params_billion <= 20.0:
category = "together-ai-7.1b-20b"
elif params_billion <= 40.0:
category = "together-ai-20.1b-40b"
elif params_billion <= 70.0:
category = "together-ai-40.1b-70b"
return category
return None
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
a100_40gb_price_per_second_public = 0.001150
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
@lru_cache(maxsize=128)
def _select_tokenizer(model: str):
if model in litellm.cohere_models and "command-r" in model:
@ -4363,7 +4347,7 @@ def _cost_per_token_custom_pricing_helper(
def cost_per_token(
model="",
model: str = "",
prompt_tokens=0,
completion_tokens=0,
response_time_ms=None,
@ -4388,6 +4372,8 @@ def cost_per_token(
Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
"""
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens,
@ -4560,213 +4546,6 @@ def cost_per_token(
)
def completion_cost(
completion_response=None,
model=None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model == None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices are based on the size of the llm
# get_model_params_and_category maps a model name to its size category in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are billed based on the time the request runs
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate models, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
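
# A minimal usage sketch for the cost helper above, assuming a model that exists in
# litellm.model_cost; completion_cost() derives tokens, model, and provider from the
# finished response itself.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
)
cost = litellm.completion_cost(completion_response=response)
print(f"estimated spend (USD): {cost:.6f}")
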
def supports_httpx_timeout(custom_llm_provider: str) -> bool:
"""
Helper function to know if a provider implementation supports httpx timeout
@@ -8986,6 +8765,75 @@ def exception_type(
response=original_exception.response,
litellm_debug_info=extra_information,
)
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 401:
exception_mapping_worked = True
raise AuthenticationError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 404:
exception_mapping_worked = True
raise NotFoundError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
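
# A minimal sketch of how the mapping above surfaces to callers: Predibase status codes
# are re-raised as litellm exception classes, so retry logic can key off the type.
# The model name below is a hypothetical Predibase deployment, not one from this diff.
import litellm

try:
    litellm.completion(
        model="predibase/llama-3-8b-instruct",
        messages=[{"role": "user", "content": "hello"}],
    )
except litellm.RateLimitError:
    pass  # 429 -> back off and retry
except litellm.Timeout:
    pass  # 408 / 504 -> retry with a longer timeout
except litellm.InternalServerError:
    pass  # 500 -> surface the provider error
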
elif custom_llm_provider == "bedrock":
if (
"too many tokens" in error_str

View file

@@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker",
"mode": "chat"
},
"together-ai-up-to-3b": {
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
},
"together-ai-3.1b-7b": {
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
},
"together-ai-7.1b-20b": {
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000004,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
},
"together-ai-20.1b-40b": {
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
},
"together-ai-40.1b-70b": {
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,

View file

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.40.7"
version = "1.40.8"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -84,7 +84,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.40.7"
version = "1.40.8"
version_files = [
"pyproject.toml:^version"
]

View file

@@ -1,3 +1,3 @@
ignore = ["F405"]
ignore = ["F405", "E402"]
extend-select = ["E501"]
line-length = 120

View file

@@ -120,8 +120,23 @@ const ChatUI: React.FC<ChatUIProps> = ({
// Now, 'options' contains the list you wanted
console.log(options); // You can log it to verify the list
// if options.length > 0, only store unique values
if (options.length > 0) {
const uniqueModels = Array.from(new Set(options));
console.log("Unique models:", uniqueModels);
// sort uniqueModels alphabetically
uniqueModels.sort((a: any, b: any) => a.label.localeCompare(b.label));
console.log("Model info:", modelInfo);
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available
setModelInfo(options);
setModelInfo(uniqueModels);
}
setSelectedModel(fetchedAvailableModels.data[0].id);
}
} catch (error) {

View file

@@ -1130,7 +1130,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedAPIKey(key);
}}
>
{key["key_alias"]} (Enterpise only Feature)
{key["key_alias"]} (Enterprise only Feature)
</SelectItem>
);
}
@@ -1165,7 +1165,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedCustomer(user);
}}
>
{user} (Enterpise only Feature)
{user} (Enterprise only Feature)
</SelectItem>
);
})

View file

@@ -114,7 +114,7 @@ const Navbar: React.FC<NavbarProps> = ({
textDecoration: "underline",
}}
>
Get enterpise license
Get enterprise license
</a>
</div>
) : null}

View file

@@ -832,7 +832,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
// @ts-ignore
disabled={true}
>
{tag} (Enterpise only Feature)
{tag} (Enterprise only Feature)
</SelectItem>
);
})}