Merge branch 'main' into litellm_aws_kms_fixes

This commit is contained in:
Krish Dholakia 2024-06-10 20:17:34 -07:00 committed by GitHub
commit 4475d2e5b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 1293 additions and 483 deletions

View file

@ -150,7 +150,7 @@ $ litellm --config /path/to/config.yaml
```bash
curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
**Create a Thread**
@ -162,6 +162,14 @@ curl http://0.0.0.0:4000/v1/threads \
-d ''
```
**Get a Thread**
```bash
curl http://0.0.0.0:4000/v1/threads/{thread_id} \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
**Add Messages to the Thread**
```bash

View file

@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t
</TabItem>
</Tabs>
## Switch Cache On / Off Per LiteLLM Call
LiteLLM supports 4 cache-controls:
- `no-cache`: *Optional(bool)* When `True`, will not return a cached response; the actual endpoint is called instead.
- `no-store`: *Optional(bool)* When `True`, will not cache the response.
- `ttl`: *Optional(int)* Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within the user-defined age (in seconds).
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
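The controls above can also be combined in a single `cache` dict on one call - for example, writing a response to the cache for 10 minutes while only reusing cached responses that are at most 10 minutes old. A minimal sketch (combining `ttl` and `s-maxage` in one dict is an assumption; both keys are read from the same `cache` parameter):
```python
import litellm

# assumes caching has already been enabled (see the setup steps above)
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
    # store for 600s, and only accept cached responses younger than 600s
    cache={"ttl": 600, "s-maxage": 600},
)
```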
<Tabs>
<TabItem value="no-cache" label="No-Cache">
Example usage of `no-cache` - when set to `True`, a cached response will not be returned; the actual endpoint is called instead.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-cache": True},
)
```
</TabItem>
<TabItem value="no-store" label="No-Store">
Example usage of `no-store` - when set to `True`, the response will not be written to the cache.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"no-store": True},
)
```
</TabItem>
<TabItem value="ttl" label="ttl">
Example usage of `ttl` - cache the response for 10 seconds.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"ttl": 10},
)
```
</TabItem>
<TabItem value="s-maxage" label="s-maxage">
Example usage of `s-maxage` - only accept cached responses that are at most 60 seconds old.
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "hello who are you"
}
],
cache={"s-maxage": 60},
)
```
</TabItem>
</Tabs>
## Cache Context Manager - Enable, Disable, Update Cache

View file

@ -1,3 +1,5 @@
# llmcord.py
llmcord.py lets you and your friends chat with LLMs directly in your Discord server. It works with practically any LLM, remote or locally hosted.
GitHub: https://github.com/jakobdylanc/discord-llm-chatbot

View file

@ -138,14 +138,22 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
<Image img={require('../../img/admin_ui_spend.png')} />
## API Endpoints to get Spend
#### Getting Spend Reports - To Charge Other Teams, Customers
Use the `/global/spend/report` endpoint to get a daily spend report per
- team
- customer [this is the `user` passed to the `/chat/completions` request](#how-to-track-spend-with-litellm)
<Tabs>
<TabItem value="per team" label="Spend Per Team">
##### Example Request
👉 Key Change: Specify `group_by=team`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=team' \
-H 'Authorization: Bearer sk-1234'
```
@ -254,6 +262,69 @@ Output from script
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="per customer" label="Spend Per Customer">
##### Example Request
👉 Key Change: Specify `group_by=customer`
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30&group_by=customer' \
-H 'Authorization: Bearer sk-1234'
```
##### Example Response
```shell
[
{
"group_by_day": "2024-04-30T00:00:00+00:00",
"customers": [
{
"customer": "palantir",
"total_spend": 0.0015265,
"metadata": [ # see the spend by unique(key + model)
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "88dc28.." # the hashed api key
},
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "a73dc2.." # the hashed api key
},
{
"model": "chatgpt-v-2",
"spend": 0.000214,
"total_tokens": 122,
"api_key": "898c28.." # the hashed api key
},
{
"model": "gpt-3.5-turbo",
"spend": 0.0000825,
"total_tokens": 85,
"api_key": "84dc28.." # the hashed api key
}
]
}
]
}
]
```
</TabItem>
</Tabs>
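The same report can be consumed programmatically. A minimal sketch (assumes the `requests` package is installed and uses the `group_by=customer` response shape shown above):
```python
import requests

resp = requests.get(
    "http://localhost:4000/global/spend/report",
    params={
        "start_date": "2024-04-01",
        "end_date": "2024-06-30",
        "group_by": "customer",
    },
    headers={"Authorization": "Bearer sk-1234"},
)
resp.raise_for_status()

# roll up total spend per customer across days
spend_per_customer: dict = {}
for day in resp.json():
    for row in day.get("customers", []):
        spend_per_customer[row["customer"]] = (
            spend_per_customer.get(row["customer"], 0.0) + row["total_spend"]
        )
print(spend_per_customer)
```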

View file

@ -42,6 +42,14 @@ Set `JSON_LOGS="True"` in your env:
```bash
export JSON_LOGS="True"
```
**OR**
Set `json_logs: true` in your yaml:
```yaml
litellm_settings:
json_logs: true
```
Start proxy
@ -49,4 +57,35 @@ Start proxy
$ litellm
```
The proxy will now output all logs in JSON format.
## Control Log Output
Turn off FastAPI's default 'INFO' logs
1. Turn on 'json logs'
```yaml
litellm_settings:
json_logs: true
```
2. Set `LITELLM_LOG` to 'ERROR'
Only get logs if an error occurs.
```bash
export LITELLM_LOG="ERROR"
```
3. Start proxy
```bash
$ litellm
```
Expected Output:
```bash
# no info statements
```

View file

@ -2,11 +2,21 @@
Call management endpoints on behalf of a user. (Useful when connecting proxy to your development platform).
:::info
Requires Enterprise License for usage.
:::
:::tip
Requires an Enterprise License. Get in touch with us [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).
:::
## 1. Switch on audit Logs
Add `store_audit_logs` to your litellm config.yaml and then start the proxy
```shell
litellm_settings:
store_audit_logs: true
```
## 2. Set `LiteLLM-Changed-By` in request headers
Set the 'user_id' in request headers when calling a management endpoint. [View Full List](https://litellm-api.up.railway.app/#/team%20management).
@ -26,7 +36,7 @@ curl -X POST 'http://0.0.0.0:4000/team/update' \
}'
```
## 3. Emitted Audit Log
```bash
{

View file

@ -21,6 +21,7 @@ general_settings:
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```
Set slack webhook url in your env
@ -28,6 +29,11 @@ Set slack webhook url in your env
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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"
```
Turn off FastAPI's default INFO logs
```bash
export LITELLM_LOG="ERROR"
```
:::info
Need help or want dedicated support? Talk to a founder [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat).

View file

@ -2,18 +2,13 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔥 Load Balancing, Fallbacks, Retries, Timeouts
- Quick Start [load balancing](#test---load-balancing)
- Quick Start [client side fallbacks](#test---client-side-fallbacks)
If a call fails after num_retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if given).
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
## Quick Start - Load Balancing
#### Step 1 - Set deployments on config
**Example config below**. Here requests with `model=gpt-3.5-turbo` will be routed across multiple instances of `azure/gpt-3.5-turbo`
```yaml
@ -38,50 +33,220 @@ model_list:
rpm: 1440
```
#### Step 2: Start Proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
### Test - Load Balancing
Curl Command
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
👉 Key Change: `model="gpt-3.5-turbo"`
**Check the `model_id` in Response Headers to make sure the requests are being load balanced**
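To check this from a script, inspect the headers on the raw HTTP response. A minimal sketch (assumes the `requests` package; the exact header name, shown here as `x-litellm-model-id`, is an assumption and may differ between proxy versions):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "this is a test request, write a short poem"}],
    },
)
# the deployment that served the request is reported in the response headers
print(resp.headers.get("x-litellm-model-id"))
```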
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
]
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Send the request with `"model": "gpt-3.5-turbo"` in the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="gpt-3.5-turbo",
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
If you want to call a specific model defined in the `config.yaml`, you can call the `litellm_params: model`
</Tabs>
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail
2. litellm proxy will loop through all the model_groups specified in `fallbacks=["gpt-3.5-turbo"]`
3. The request to `model="gpt-3.5-turbo"` will succeed and the client making the request will get a response from gpt-3.5-turbo
👉 Key Change: `"fallbacks": ["gpt-3.5-turbo"]`
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
model="zephyr-beta",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}'
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model="zephyr-beta",
extra_body={
"metadata": {
"fallbacks": ["gpt-3.5-turbo"]
}
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
<!--
### Test it!
In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
    "model": "zephyr-beta", # 👈 MODEL NAME to fallback from
    "messages": [
        {"role": "user", "content": "what color is red"}
    ],
    "mock_testing_fallbacks": true
}'
``` -->
## Advanced
### Fallbacks + Retries + Timeouts + Cooldowns
**Set via config**
```yaml
@ -114,44 +279,7 @@ litellm_settings:
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
allowed_fails: 3 # cooldown model if it fails > 1 call in a minute.
```
### Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"context_window_fallbacks": [{"zephyr-beta": ["gpt-3.5-turbo"]}],
"num_retries": 2,
"timeout": 10
}
'
```
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
```
## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
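A minimal sketch of switching this on when using the Python Router directly (the `model_list` entry is illustrative; on the proxy the equivalent flag is set in the config):
```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo"},
        }
    ],
    enable_pre_call_checks=True,  # filter out deployments whose context window is too small for the prompt
)
```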
@ -287,7 +415,7 @@ print(response)
</Tabs>
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
@ -350,7 +478,7 @@ print(response)
print(f"response.headers.get('x-litellm-model-api-base')")
```
### Custom Timeouts, Stream Timeouts - Per Model
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
```yaml
model_list:
@ -379,7 +507,7 @@ $ litellm --config /path/to/config.yaml
```
### Setting Dynamic Timeouts - Per Request
LiteLLM Proxy supports setting a `timeout` per request
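For example, the per-request `timeout` (in seconds) can be sent in the request body, as in the dynamic fallbacks curl shown earlier. A minimal sketch (assumes the `requests` package):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "what llm are you"}],
        "timeout": 10,  # applies to this request only
    },
)
print(resp.json())
```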

View file

@ -255,6 +255,7 @@ const sidebars = {
"projects/GPT Migrate",
"projects/YiVal",
"projects/LiteLLM Proxy",
"projects/llm_cord",
],
},
],

View file

@ -709,6 +709,7 @@ all_embedding_models = (
openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .cost_calculator import completion_cost
from .utils import (
client,
exception_type,
@ -718,7 +719,6 @@ from .utils import (
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
completion_cost,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,

View file

@ -1,6 +1,7 @@
# What is this?
## File for 'response_cost' calculation in Logging
from typing import Optional, Union, Literal, List
import litellm._logging
from litellm.utils import (
ModelResponse,
EmbeddingResponse,
@ -8,10 +9,281 @@ from litellm.utils import (
TranscriptionResponse,
TextCompletionResponse,
CallTypes,
cost_per_token,
print_verbose,
CostPerToken,
token_counter,
)
import litellm
from litellm import verbose_logger
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
import re
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
def completion_cost(
completion_response=None,
model: Optional[str] = None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, or any litellm-supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
def response_cost_calculator(
@ -47,7 +319,7 @@ def response_cost_calculator(
) -> Optional[float]:
try:
response_cost: float = 0.0
if cache_hit is not None and cache_hit is True:
response_cost = 0.0
else:
response_object._hidden_params["optional_params"] = optional_params
@ -62,9 +334,11 @@ def response_cost_calculator(
if (
model in litellm.model_cost
and custom_pricing is not None
and custom_llm_provider is True
): # override defaults if custom pricing is set
base_model = model
elif base_model is None:
base_model = model
# base_model defaults to None if not set on model_info
response_cost = completion_cost(
completion_response=response_object,
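The relocated `completion_cost` helper remains exported at the package level (see the `__init__.py` change above), so existing call sites are unchanged. A minimal sketch:
```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
)
# cost in USD, derived from the usage block on the response
cost = litellm.completion_cost(completion_response=response)
print(f"cost: ${cost:.6f}")
```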

View file

@ -20,7 +20,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -32,8 +32,14 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -60,7 +66,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
message,
model,
llm_provider,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -72,8 +78,14 @@ class NotFoundError(openai.NotFoundError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -262,7 +274,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -274,8 +286,18 @@ class RateLimitError(openai.RateLimitError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -421,7 +443,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -433,8 +455,18 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
@ -460,7 +492,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
message,
llm_provider,
model,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -472,8 +504,18 @@ class InternalServerError(openai.InternalServerError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
def __str__(self):
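With `response` now optional on these exception classes, callers can raise them without building an `httpx.Response` first; a mocked response is attached internally. A minimal sketch of the intent (the message and model names are illustrative):
```python
import litellm

try:
    # no httpx.Response needs to be passed anymore - a mock one is attached internally
    raise litellm.InternalServerError(
        message="provider returned a malformed response",
        llm_provider="vertex_ai",
        model="gemini-pro",
    )
except litellm.InternalServerError as e:
    print(e.status_code, e.response.status_code)
```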

View file

@ -366,8 +366,6 @@ class OpenTelemetry(CustomLogger):
)
message = choice.get("message")
if not isinstance(message, dict):
message = message.dict()
tool_calls = message.get("tool_calls")
if tool_calls:
span.set_attribute(

View file

View file

@ -3,6 +3,7 @@
from functools import partial
import os, types
import traceback
import json
from enum import Enum
import requests, copy # type: ignore
@ -242,12 +243,12 @@ class PredibaseChatCompletion(BaseLLM):
"details" in completion_response
and "tokens" in completion_response["details"]
):
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["details"]["finish_reason"]
)
sum_logprob = 0
for token in completion_response["details"]["tokens"]:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
model_response["choices"][0][
"message"
@ -265,7 +266,7 @@ class PredibaseChatCompletion(BaseLLM):
):
sum_logprob = 0
for token in item["tokens"]:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
if len(item["generated_text"]) > 0:
message_obj = Message(
@ -275,7 +276,7 @@ class PredibaseChatCompletion(BaseLLM):
else:
message_obj = Message(content=None)
choice_obj = Choices(
finish_reason=map_finish_reason(item["finish_reason"]),
index=idx + 1,
message=message_obj,
)
@ -285,10 +286,8 @@ class PredibaseChatCompletion(BaseLLM):
## CALCULATING USAGE
prompt_tokens = 0
try:
prompt_tokens = litellm.token_counter(messages=messages)
except Exception:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
output_text = model_response["choices"][0]["message"].get("content", "")
@ -331,6 +330,7 @@ class PredibaseChatCompletion(BaseLLM):
logging_obj,
optional_params: dict,
tenant_id: str,
timeout: Union[float, httpx.Timeout],
acompletion=None,
litellm_params=None,
logger_fn=None,
@ -340,6 +340,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = ""
input_text = ""
base_url = "https://serving.app.predibase.com"
if "https" in model:
completion_url = model
elif api_base:
@ -349,7 +350,7 @@ class PredibaseChatCompletion(BaseLLM):
completion_url = f"{base_url}/{tenant_id}/deployments/v2/llms/{model}"
if optional_params.get("stream", False) is True:
completion_url += "/generate_stream"
else:
completion_url += "/generate"
@ -393,9 +394,9 @@ class PredibaseChatCompletion(BaseLLM):
},
)
## COMPLETION CALL
if acompletion is True:
### ASYNC STREAMING
if stream is True:
return self.async_streaming(
model=model,
messages=messages,
@ -410,6 +411,7 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
else:
### ASYNC COMPLETION
@ -428,10 +430,11 @@ class PredibaseChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
### SYNC STREAMING
if stream is True:
response = requests.post(
completion_url,
headers=headers,
@ -452,7 +455,6 @@ class PredibaseChatCompletion(BaseLLM):
headers=headers,
data=json.dumps(data),
)
return self.process_response(
model=model,
response=response,
@ -480,23 +482,26 @@ class PredibaseChatCompletion(BaseLLM):
stream,
data: dict,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
litellm_params=None,
logger_fn=None,
headers={},
) -> ModelResponse:
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
try:
response = await async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
except httpx.HTTPStatusError as e:
raise PredibaseError(
status_code=e.response.status_code,
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
raise PredibaseError(
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
)
return self.process_response(
model=model,
response=response,
@ -522,6 +527,7 @@ class PredibaseChatCompletion(BaseLLM):
api_key,
logging_obj,
data: dict,
timeout: Union[float, httpx.Timeout],
optional_params=None,
litellm_params=None,
logger_fn=None,

View file

@ -432,9 +432,9 @@ def mock_completion(
if isinstance(mock_response, openai.APIError):
raise mock_response
raise litellm.APIError(
status_code=getattr(mock_response, "status_code", 500), # type: ignore
message=getattr(mock_response, "text", str(mock_response)),
llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
@ -1949,7 +1949,8 @@ def completion(
)
api_base = (
api_base
or optional_params.pop("api_base", None)
or optional_params.pop("base_url", None)
or litellm.api_base
or get_secret("PREDIBASE_API_BASE")
@ -1977,12 +1978,13 @@ def completion(
custom_prompt_dict=custom_prompt_dict,
api_key=api_key,
tenant_id=tenant_id,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] is True
and acompletion is False
):
return _model_response
response = _model_response

View file

@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker",
"mode": "chat"
},
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
},
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
},
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
},
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
},
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,

View file

@ -1,7 +1,12 @@
import json
import logging
from logging import Formatter
import os
from litellm import json_logs
# Set default log level to INFO
log_level = os.getenv("LITELLM_LOG", "INFO")
numeric_level: str = getattr(logging, log_level.upper())
class JsonFormatter(Formatter):
@ -16,6 +21,14 @@ class JsonFormatter(Formatter):
logger = logging.root
handler = logging.StreamHandler()
if json_logs:
handler.setFormatter(JsonFormatter())
else:
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
datefmt="%H:%M:%S",
)
handler.setFormatter(formatter)
logger.handlers = [handler]
logger.setLevel(numeric_level)

View file

@ -8,6 +8,17 @@ model_list:
- model_name: llama3-70b-8192
litellm_params:
model: groq/llama3-70b-8192
- model_name: fake-openai-endpoint
litellm_params:
model: predibase/llama-3-8b-instruct
api_base: "http://0.0.0.0:8081"
api_key: os.environ/PREDIBASE_API_KEY
tenant_id: os.environ/PREDIBASE_TENANT_ID
max_retries: 0
temperature: 0.1
max_new_tokens: 256
return_full_text: false
# - litellm_params:
# api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
# api_key: os.environ/AZURE_EUROPE_API_KEY
@ -57,6 +68,8 @@ router_settings:
litellm_settings:
success_callback: ["langfuse"]
cache: True
failure_callback: ["langfuse"]
general_settings:
alerting: ["email"]

View file

@ -160,6 +160,7 @@ from litellm.proxy.auth.auth_checks import (
get_user_object,
allowed_routes_check,
get_actual_routes,
log_to_opentelemetry,
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
@ -368,6 +369,11 @@ from typing import Dict
api_key_header = APIKeyHeader(
name="Authorization", auto_error=False, description="Bearer token"
)
azure_api_key_header = APIKeyHeader(
name="API-Key",
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
user_api_base = None
user_model = None
user_debug = False
@ -508,13 +514,19 @@ async def check_request_disconnection(request: Request, llm_api_call_task):
async def user_api_key_auth(
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
) -> UserAPIKeyAuth:
global master_key, prisma_client, llm_model_list, user_custom_auth, custom_db_client, general_settings, proxy_logging_obj
try:
if isinstance(api_key, str):
passed_in_key = api_key
api_key = _get_bearer_token(api_key=api_key)
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
@ -1495,7 +1507,7 @@ async def user_api_key_auth(
)
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid proxy server token passed")
if valid_token_dict is not None:
if user_id_information is not None and _is_user_proxy_admin(
user_id_information
@ -1528,6 +1540,14 @@ async def user_api_key_auth(
str(e)
)
)
# Log this exception to OTEL
if open_telemetry_logger is not None:
await open_telemetry_logger.async_post_call_failure_hook(
original_exception=e,
user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, litellm.BudgetExceededError):
raise ProxyException(
@ -7803,6 +7823,10 @@ async def get_global_spend_report(
default=None,
description="Time till which to view spend",
),
group_by: Optional[Literal["team", "customer"]] = fastapi.Query(
default="team",
description="Group spend by internal team or customer",
),
):
"""
Get Daily Spend per Team, based on specific startTime and endTime. Per team, view usage by each key, model
@ -7849,69 +7873,130 @@ async def get_global_spend_report(
f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
)
if group_by == "team":
# first get data from spend logs -> SpendByModelApiKey
# then read data from "SpendByModelApiKey" to format the response obj
sql_query = """
WITH SpendByModelApiKey AS (
SELECT
date_trunc('day', sl."startTime") AS group_by_day,
COALESCE(tt.team_alias, 'Unassigned Team') AS team_name,
sl.model,
sl.api_key,
SUM(sl.spend) AS model_api_spend,
SUM(sl.total_tokens) AS model_api_tokens
FROM
"LiteLLM_SpendLogs" sl
LEFT JOIN
"LiteLLM_TeamTable" tt
ON
sl.team_id = tt.team_id
WHERE
sl."startTime" BETWEEN $1::date AND $2::date
GROUP BY
date_trunc('day', sl."startTime"),
tt.team_alias,
sl.model,
sl.api_key
)
SELECT
group_by_day,
jsonb_agg(jsonb_build_object(
'team_name', team_name,
'total_spend', total_spend,
'metadata', metadata
)) AS teams
FROM (
SELECT
group_by_day,
team_name,
SUM(model_api_spend) AS total_spend,
jsonb_agg(jsonb_build_object(
'model', model,
'api_key', api_key,
'spend', model_api_spend,
'total_tokens', model_api_tokens
)) AS metadata
FROM
SpendByModelApiKey
GROUP BY
group_by_day,
team_name
) AS aggregated
GROUP BY
group_by_day
ORDER BY
group_by_day;
"""
db_response = await prisma_client.db.query_raw(
sql_query, start_date_obj, end_date_obj
)
if db_response is None:
return []
return db_response
elif group_by == "customer":
sql_query = """
WITH SpendByModelApiKey AS (
SELECT
date_trunc('day', sl."startTime") AS group_by_day,
sl.end_user AS customer,
sl.model,
sl.api_key,
SUM(sl.spend) AS model_api_spend,
SUM(sl.total_tokens) AS model_api_tokens
FROM
"LiteLLM_SpendLogs" sl
WHERE
sl."startTime" BETWEEN $1::date AND $2::date
GROUP BY
date_trunc('day', sl."startTime"),
customer,
sl.model,
sl.api_key
)
SELECT
group_by_day,
jsonb_agg(jsonb_build_object(
'customer', customer,
'total_spend', total_spend,
'metadata', metadata
)) AS customers
FROM
(
SELECT
group_by_day,
customer,
SUM(model_api_spend) AS total_spend,
jsonb_agg(jsonb_build_object(
'model', model,
'api_key', api_key,
'spend', model_api_spend,
'total_tokens', model_api_tokens
)) AS metadata
FROM
SpendByModelApiKey
GROUP BY
group_by_day,
customer
) AS aggregated
GROUP BY
group_by_day
ORDER BY
group_by_day;
"""
db_response = await prisma_client.db.query_raw(
sql_query, start_date_obj, end_date_obj
)
if db_response is None:
return []
return db_response
except Exception as e:
raise HTTPException(
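The `azure_api_key_header` scheme added above lets clients that send a bare `API-Key` header (as some older openai/Azure SDK versions do) authenticate against the proxy without an `Authorization: Bearer` header. A minimal sketch (assumes the `requests` package and a locally running proxy):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    # older Azure-style clients put the key in an `API-Key` header
    headers={"API-Key": "sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "what llm are you"}],
    },
)
print(resp.status_code)
```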

View file

@ -2056,12 +2056,15 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks
if isinstance(fallbacks, list):
fallback_model_group = fallbacks
elif isinstance(fallbacks, dict):
for idx, item in enumerate(fallbacks):
if list(item.keys())[0] == model_group:
fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
## if none, check for generic fallback
if (
fallback_model_group is None
@ -2310,13 +2313,16 @@ class Router:
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}") verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
fallback_model_group = None fallback_model_group = None
generic_fallback_idx: Optional[int] = None generic_fallback_idx: Optional[int] = None
## check for specific model group-specific fallbacks if isinstance(fallbacks, list):
for idx, item in enumerate(fallbacks): fallback_model_group = fallbacks
if list(item.keys())[0] == model_group: elif isinstance(fallbacks, dict):
fallback_model_group = item[model_group] ## check for specific model group-specific fallbacks
break for idx, item in enumerate(fallbacks):
elif list(item.keys())[0] == "*": if list(item.keys())[0] == model_group:
generic_fallback_idx = idx fallback_model_group = item[model_group]
break
elif list(item.keys())[0] == "*":
generic_fallback_idx = idx
## if none, check for generic fallback ## if none, check for generic fallback
if ( if (
fallback_model_group is None fallback_model_group is None
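In practice this lets `fallbacks` be passed as a plain list for client-side, per-request fallbacks, alongside the existing `[{model_group: [...]}]` mapping. A minimal sketch mirroring the test added further below (model names and keys are illustrative):

```python
import os

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "bad-model",
            "litellm_params": {"model": "openai/my-bad-model", "api_key": "my-bad-api-key"},
        },
        {
            "model_name": "my-good-model",
            "litellm_params": {"model": "gpt-4o", "api_key": os.getenv("OPENAI_API_KEY")},
        },
    ],
)

# List form: try "bad-model" first, then fall back to "my-good-model".
response = router.completion(
    model="bad-model",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    fallbacks=["my-good-model"],
    mock_testing_fallbacks=True,    # force the fallback path, as in the test
    mock_response="Hey! nice day",  # keeps the sketch offline
)
```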
View file
@@ -345,7 +345,7 @@ def test_completion_claude_3_function_call(model):
drop_params=True, drop_params=True,
) )
# Add any assertions, here to check response args # Add any assertions here to check response args
print(response) print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str) assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance( assert isinstance(
@@ -530,6 +530,7 @@ def test_completion_cohere_command_r_plus_function_call():
messages=messages, messages=messages,
tools=tools, tools=tools,
tool_choice="auto", tool_choice="auto",
force_single_step=True,
) )
print(second_response) print(second_response)
except Exception as e: except Exception as e:
View file
@@ -517,3 +517,51 @@ def test_groq_response_cost_tracking(is_streaming):
assert response_cost > 0.0 assert response_cost > 0.0
print(f"response_cost: {response_cost}") print(f"response_cost: {response_cost}")
def test_together_ai_qwen_completion_cost():
input_kwargs = {
"completion_response": litellm.ModelResponse(
**{
"id": "890db0c33c4ef94b-SJC",
"choices": [
{
"finish_reason": "eos",
"index": 0,
"message": {
"content": "I am Qwen, a large language model created by Alibaba Cloud.",
"role": "assistant",
},
}
],
"created": 1717900130,
"model": "together_ai/qwen/Qwen2-72B-Instruct",
"object": "chat.completion",
"system_fingerprint": None,
"usage": {
"completion_tokens": 15,
"prompt_tokens": 23,
"total_tokens": 38,
},
}
),
"model": "qwen/Qwen2-72B-Instruct",
"prompt": "",
"messages": [],
"completion": "",
"total_time": 0.0,
"call_type": "completion",
"custom_llm_provider": "together_ai",
"region_name": None,
"size": None,
"quality": None,
"n": None,
"custom_cost_per_token": None,
"custom_cost_per_second": None,
}
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
)
assert response == "together-ai-41.1b-80b"
View file
@@ -3,6 +3,7 @@ import os
import sys import sys
import traceback import traceback
import subprocess, asyncio import subprocess, asyncio
from typing import Any
sys.path.insert( sys.path.insert(
0, os.path.abspath("../..") 0, os.path.abspath("../..")
@@ -19,6 +20,7 @@ from litellm import (
) )
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import pytest import pytest
from unittest.mock import patch, MagicMock
litellm.vertex_project = "pathrise-convert-1606954137718" litellm.vertex_project = "pathrise-convert-1606954137718"
litellm.vertex_location = "us-central1" litellm.vertex_location = "us-central1"
@@ -655,3 +657,47 @@ def test_litellm_predibase_exception():
# accuracy_score = counts[True]/(counts[True] + counts[False]) # accuracy_score = counts[True]/(counts[True] + counts[False])
# print(f"accuracy_score: {accuracy_score}") # print(f"accuracy_score: {accuracy_score}")
@pytest.mark.parametrize("provider", ["predibase"])
def test_exception_mapping(provider):
"""
For predibase, run through a set of mock exceptions
assert that they are being mapped correctly
"""
litellm.set_verbose = True
error_map = {
400: litellm.BadRequestError,
401: litellm.AuthenticationError,
404: litellm.NotFoundError,
408: litellm.Timeout,
429: litellm.RateLimitError,
500: litellm.InternalServerError,
503: litellm.ServiceUnavailableError,
}
for code, expected_exception in error_map.items():
mock_response = Exception()
setattr(mock_response, "text", "This is an error message")
setattr(mock_response, "llm_provider", provider)
setattr(mock_response, "status_code", code)
response: Any = None
try:
response = completion(
model="{}/test-model".format(provider),
messages=[{"role": "user", "content": "Hey, how's it going?"}],
mock_response=mock_response,
)
except expected_exception:
continue
except Exception as e:
response = "{}\n{}".format(str(e), traceback.format_exc())
pytest.fail(
"Did not raise expected exception. Expected={}, Return={},".format(
expected_exception, response
)
)
pass
View file
@@ -272,7 +272,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.message) print(e.message)
assert "Authentication Error, Invalid token passed" in e.message assert "Authentication Error, Invalid proxy server token passed" in e.message
pass pass
View file
@@ -1059,3 +1059,53 @@ async def test_default_model_fallbacks(sync_mode, litellm_module_fallbacks):
assert isinstance(response, litellm.ModelResponse) assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o" assert response.model is not None and response.model == "gpt-4o"
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_client_side_fallbacks_list(sync_mode):
"""
Tests Client Side Fallbacks
User can pass "fallbacks": ["gpt-3.5-turbo"] and this should work
"""
router = Router(
model_list=[
{
"model_name": "bad-model",
"litellm_params": {
"model": "openai/my-bad-model",
"api_key": "my-bad-api-key",
},
},
{
"model_name": "my-good-model",
"litellm_params": {
"model": "gpt-4o",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
],
)
if sync_mode:
response = router.completion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
else:
response = await router.acompletion(
model="bad-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
fallbacks=["my-good-model"],
mock_testing_fallbacks=True,
mock_response="Hey! nice day",
)
assert isinstance(response, litellm.ModelResponse)
assert response.model is not None and response.model == "gpt-4o"
View file
@@ -326,6 +326,22 @@ class Function(OpenAIObject):
super(Function, self).__init__(**data) super(Function, self).__init__(**data)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class ChatCompletionDeltaToolCall(OpenAIObject): class ChatCompletionDeltaToolCall(OpenAIObject):
id: Optional[str] = None id: Optional[str] = None
@@ -385,6 +401,22 @@ class ChatCompletionMessageToolCall(OpenAIObject):
else: else:
self.type = "function" self.type = "function"
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class Message(OpenAIObject): class Message(OpenAIObject):
def __init__( def __init__(
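These helpers let `Function` and `ChatCompletionMessageToolCall` objects be read and written like dicts as well as through attributes. A small sketch, assuming the classes stay importable from `litellm.utils` (the import path is not shown in this diff):

```python
from litellm.utils import Function  # import path assumed

fn = Function(name="get_current_weather", arguments='{"location": "Boston"}')

assert "name" in fn                      # __contains__ checks for the attribute
print(fn["name"])                        # __getitem__ -> "get_current_weather"
print(fn.get("missing_key", "default"))  # .get() falls back to the default
fn["name"] = "get_weather"               # __setitem__ assigns the attribute
```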
@@ -3929,54 +3961,6 @@ def client(original_function):
return wrapper return wrapper
####### USAGE CALCULATOR ################
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name):
import re
model_name = model_name.lower()
params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if params_match != None:
params_match = params_match.group(1)
params_match = params_match.replace("b", "")
params_billion = float(params_match)
# Determine the category based on the number of parameters
if params_billion <= 3.0:
category = "together-ai-up-to-3b"
elif params_billion <= 7.0:
category = "together-ai-3.1b-7b"
elif params_billion <= 20.0:
category = "together-ai-7.1b-20b"
elif params_billion <= 40.0:
category = "together-ai-20.1b-40b"
elif params_billion <= 70.0:
category = "together-ai-40.1b-70b"
return category
return None
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
a100_40gb_price_per_second_public = 0.001150
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time / 1000
@lru_cache(maxsize=128) @lru_cache(maxsize=128)
def _select_tokenizer(model: str): def _select_tokenizer(model: str):
if model in litellm.cohere_models and "command-r" in model: if model in litellm.cohere_models and "command-r" in model:
@@ -4363,7 +4347,7 @@ def _cost_per_token_custom_pricing_helper(
def cost_per_token( def cost_per_token(
model="", model: str = "",
prompt_tokens=0, prompt_tokens=0,
completion_tokens=0, completion_tokens=0,
response_time_ms=None, response_time_ms=None,
@@ -4388,6 +4372,8 @@ def cost_per_token(
Returns: Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively. tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
""" """
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ## ## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper( response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
@@ -4560,213 +4546,6 @@ def cost_per_token(
) )
def completion_cost(
completion_response=None,
model=None,
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate, sagemaker
call_type: Literal[
"embedding",
"aembedding",
"completion",
"acompletion",
"atext_completion",
"text_completion",
"image_generation",
"aimage_generation",
"moderation",
"amoderation",
"atranscription",
"transcription",
"aspeech",
"speech",
] = "completion",
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> float:
"""
Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.
Parameters:
completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
[OPTIONAL PARAMS]
model (str): Optional. The name of the language model used in the completion calls
prompt (str): Optional. The input prompt passed to the llm
completion (str): Optional. The output completion text from the llm
total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
float: The cost in USD dollars for the completion based on the provided parameters.
Exceptions:
Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
Note:
- If completion_response is provided, the function extracts token information and the model name from it.
- If completion_response is not provided, the function calculates token counts based on the model and input text.
- The cost is calculated based on the model, prompt tokens, and completion tokens.
- For certain models containing "togethercomputer" in the name, prices are based on the model size.
- For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
"""
try:
if (
(call_type == "aimage_generation" or call_type == "image_generation")
and model is not None
and isinstance(model, str)
and len(model) == 0
and custom_llm_provider == "azure"
):
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
completion_tokens = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
if hasattr(completion_response, "_hidden_params"):
if (
completion_response._hidden_params.get("model", None) is not None
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model == None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
# Vertex Charges Flat $0.20 per image
return 0.020
# fix size to match naming convention
if "x" in size and "-x-" not in size:
size = size.replace("x", "-x-")
image_gen_model_name = f"{size}/{model}"
image_gen_model_name_with_quality = image_gen_model_name
if quality is not None:
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
size = size.split("-x-")
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
def supports_httpx_timeout(custom_llm_provider: str) -> bool: def supports_httpx_timeout(custom_llm_provider: str) -> bool:
""" """
Helper function to know if a provider implementation supports httpx timeout Helper function to know if a provider implementation supports httpx timeout
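The cost helpers removed here move out of `utils.py`. Assuming the public `litellm.completion_cost` entry point keeps working unchanged (as the cost-tracking tests above exercise), a minimal usage sketch:

```python
import litellm

# mock_response avoids a real API call; the returned ModelResponse still carries
# a model name and usage block that completion_cost can price.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello who are you"}],
    mock_response="I am a mocked assistant reply",
)

cost = litellm.completion_cost(completion_response=response)
print(f"estimated cost: ${cost:.6f}")
```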
@@ -8986,6 +8765,75 @@ def exception_type(
response=original_exception.response, response=original_exception.response,
litellm_debug_info=extra_information, litellm_debug_info=extra_information,
) )
elif hasattr(original_exception, "status_code"):
if original_exception.status_code == 500:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 401:
exception_mapping_worked = True
raise AuthenticationError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 400:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 404:
exception_mapping_worked = True
raise NotFoundError(
message=f"PredibaseException - {original_exception.message}",
llm_provider="predibase",
model=model,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"PredibaseException - {original_exception.message}",
model=model,
llm_provider=custom_llm_provider,
litellm_debug_info=extra_information,
)
elif custom_llm_provider == "bedrock": elif custom_llm_provider == "bedrock":
if ( if (
"too many tokens" in error_str "too many tokens" in error_str
View file
@@ -3009,32 +3009,37 @@
"litellm_provider": "sagemaker", "litellm_provider": "sagemaker",
"mode": "chat" "mode": "chat"
}, },
"together-ai-up-to-3b": { "together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001, "input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001, "output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-3.1b-7b": { "together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002, "input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002, "output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-7.1b-20b": { "together-ai-8.1b-21b": {
"max_tokens": 1000, "max_tokens": 1000,
"input_cost_per_token": 0.0000004, "input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000004, "output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-20.1b-40b": { "together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008, "input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008, "output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-40.1b-70b": { "together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009, "input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009, "output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai" "litellm_provider": "together_ai"
}, },
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006, "input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006, "output_cost_per_token": 0.0000006,
View file
@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "litellm" name = "litellm"
version = "1.40.7" version = "1.40.8"
description = "Library to easily interface with LLM API providers" description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"] authors = ["BerriAI"]
license = "MIT" license = "MIT"
@@ -84,7 +84,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.commitizen] [tool.commitizen]
version = "1.40.7" version = "1.40.8"
version_files = [ version_files = [
"pyproject.toml:^version" "pyproject.toml:^version"
] ]
View file
@@ -1,3 +1,3 @@
ignore = ["F405"] ignore = ["F405", "E402"]
extend-select = ["E501"] extend-select = ["E501"]
line-length = 120 line-length = 120
View file
@@ -119,9 +119,24 @@ const ChatUI: React.FC<ChatUIProps> = ({
// Now, 'options' contains the list you wanted // Now, 'options' contains the list you wanted
console.log(options); // You can log it to verify the list console.log(options); // You can log it to verify the list
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available // if options.length > 0, only store unique values
setModelInfo(options); if (options.length > 0) {
const uniqueModels = Array.from(new Set(options));
console.log("Unique models:", uniqueModels);
// sort uniqueModels alphabetically
uniqueModels.sort((a: any, b: any) => a.label.localeCompare(b.label));
console.log("Model info:", modelInfo);
// setModelInfo(options) should be inside the if block to avoid setting it when no data is available
setModelInfo(uniqueModels);
}
setSelectedModel(fetchedAvailableModels.data[0].id); setSelectedModel(fetchedAvailableModels.data[0].id);
} }
} catch (error) { } catch (error) {
View file
@@ -1130,7 +1130,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedAPIKey(key); setSelectedAPIKey(key);
}} }}
> >
{key["key_alias"]} (Enterpise only Feature) {key["key_alias"]} (Enterprise only Feature)
</SelectItem> </SelectItem>
); );
} }
@@ -1165,7 +1165,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
setSelectedCustomer(user); setSelectedCustomer(user);
}} }}
> >
{user} (Enterpise only Feature) {user} (Enterprise only Feature)
</SelectItem> </SelectItem>
); );
}) })
View file
@@ -114,7 +114,7 @@ const Navbar: React.FC<NavbarProps> = ({
textDecoration: "underline", textDecoration: "underline",
}} }}
> >
Get enterpise license Get enterprise license
</a> </a>
</div> </div>
) : null} ) : null}
View file
@@ -832,7 +832,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
// @ts-ignore // @ts-ignore
disabled={true} disabled={true}
> >
{tag} (Enterpise only Feature) {tag} (Enterprise only Feature)
</SelectItem> </SelectItem>
); );
})} })}