Merge branch 'main' into litellm_selective_access_ui

2024-02-28 19:35:48 -08:00 · 2024-02-28 19:35:48 -08:00 · 35a22e2247
commit 35a22e2247
parent 36f378c8e1 07a911eaa5
55 changed files with 2284 additions and 1102 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -130,6 +130,7 @@ jobs:
            pip install "langfuse>=2.0.0"
            pip install numpydoc
            pip install prisma            
            pip install fastapi            
            pip install "httpx==0.24.1"
            pip install "gunicorn==21.2.0"
            pip install "anyio==3.7.1"
--- a/docs/my-website/docs/completion/function_call.md
+++ b/docs/my-website/docs/completion/function_call.md
@ -1,18 +1,25 @@
 # Function Calling 
 Function calling is supported with the following models on OpenAI, Azure OpenAI
- gpt-4
+## Checking if a model supports function calling 
 - gpt-4-1106-preview
 - gpt-4-0613
 - gpt-3.5-turbo
 - gpt-3.5-turbo-1106
 - gpt-3.5-turbo-0613
 - Non OpenAI LLMs (litellm adds the function call to the prompt for these llms)
-In addition, parallel function calls is supported on the following models:
+Use `litellm.supports_function_calling(model="")` -> returns `True` if model supports Function calling, `False` if not
 - gpt-4-1106-preview
 - gpt-3.5-turbo-1106
 ```python
 assert litellm.supports_function_calling(model="gpt-3.5-turbo") == True
 assert litellm.supports_function_calling(model="azure/gpt-4-1106-preview") == True
 assert litellm.supports_function_calling(model="palm/chat-bison") == False
 assert litellm.supports_function_calling(model="ollama/llama2") == False
 ```
 ## Checking if a model supports parallel function calling 
 Use `litellm.supports_parallel_function_calling(model="")` -> returns `True` if model supports parallel function calling, `False` if not
 ```python
 assert litellm.supports_parallel_function_calling(model="gpt-4-turbo-preview") == True
 assert litellm.supports_parallel_function_calling(model="gpt-4") == False
 ```
 ## Parallel Function calling
 Parallel function calling is the model's ability to perform multiple function calls together, allowing the effects and results of these function calls to be resolved in parallel
--- a/docs/my-website/docs/providers/bedrock.md
+++ b/docs/my-website/docs/providers/bedrock.md
@ -291,7 +291,6 @@ Here's an example of using a bedrock model with LiteLLM
 | Anthropic Claude-V2.1      | `completion(model='bedrock/anthropic.claude-v2:1', messages=messages)`   | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']`           |
 | Anthropic Claude-V2      | `completion(model='bedrock/anthropic.claude-v2', messages=messages)`   | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']`           |
 | Anthropic Claude-Instant V1 | `completion(model='bedrock/anthropic.claude-instant-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']`           |
 | Anthropic Claude-V1      | `completion(model='bedrock/anthropic.claude-v1', messages=messages)`   | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']`           |
 | Amazon Titan Lite            | `completion(model='bedrock/amazon.titan-text-lite-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
 | Amazon Titan Express              | `completion(model='bedrock/amazon.titan-text-express-v1', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
 | Cohere Command              | `completion(model='bedrock/cohere.command-text-v14', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@ -1,3 +1,6 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 # VertexAI - Google [Gemini, Model Garden]
 <a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
@ -22,8 +25,36 @@ response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "co
 ## OpenAI Proxy Usage 
 Here's how to use Vertex AI with the LiteLLM Proxy Server
 1. Modify the config.yaml 
 <Tabs>
 <TabItem value="completion_param" label="Different location per model">
 Use this when you need to set a different location for each vertex model
 ```yaml
 model_list:
  - model_name: gemini-vision
    litellm_params:
      model: vertex_ai/gemini-1.0-pro-vision-001
      vertex_project: "project-id"
      vertex_location: "us-central1"
  - model_name: gemini-vision
    litellm_params:
      model: vertex_ai/gemini-1.0-pro-vision-001
      vertex_project: "project-id2"
      vertex_location: "us-east"
 ```
 </TabItem>
 <TabItem value="litellm_param" label="One location all vertex models">
 Use this when you have one vertex location for all models
 ```yaml
 litellm_settings: 
  vertex_project: "hardy-device-38811" # Your Project ID
@ -35,6 +66,10 @@ model_list:
     model: gemini-pro
 ```
 </TabItem>
 </Tabs>
 2. Start the proxy 
 ```bash
--- a/docs/my-website/docs/proxy/metrics.md
+++ b/docs/my-website/docs/proxy/metrics.md
@ -0,0 +1,44 @@
 # 💸 GET Daily Spend, Usage Metrics
 ## Request Format
 ```shell
 curl -X GET "http://0.0.0.0:4000/daily_metrics" -H "Authorization: Bearer sk-1234"
 ```
 ## Response format 
 ```json
 [
    daily_spend = [
        {
            "daily_spend": 7.9261938052047e+16,
            "day": "2024-02-01T00:00:00",
            "spend_per_model": {"azure/gpt-4": 7.9261938052047e+16},
            "spend_per_api_key": {
                "76": 914495704992000.0,
                "12": 905726697912000.0,
                "71": 866312628003000.0,
                "28": 865461799332000.0,
                "13": 859151538396000.0
            }
        },
        {
            "daily_spend": 7.938489251309491e+16,
            "day": "2024-02-02T00:00:00",
            "spend_per_model": {"gpt-3.5": 7.938489251309491e+16},
            "spend_per_api_key": {
                "91": 896805036036000.0,
                "78": 889692646082000.0,
                "49": 885386687861000.0,
                "28": 873869890984000.0,
                "56": 867398637692000.0
            }
        }
    ],
    total_spend = 200,
    top_models = {"gpt4": 0.2, "vertexai/gemini-pro":10},
    top_api_keys = {"899922": 0.9, "838hcjd999seerr88": 20}
 ]
 ```
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -40,6 +40,7 @@ const sidebars = {
        "proxy/virtual_keys",
        "proxy/users",
        "proxy/ui",
        "proxy/metrics",
        "proxy/model_management",
        "proxy/health",
        "proxy/debugging",
--- a/enterprise/utils.py
+++ b/enterprise/utils.py
@ -110,3 +110,138 @@ async def view_spend_logs_from_clickhouse(
            "log_count": num_rows,
        }
        return response_data
 def _create_clickhouse_material_views(client=None, table_names=[]):
    # Create Materialized Views if they don't exist
    # Materialized Views send new inserted rows to the aggregate tables
    verbose_logger.debug("Clickhouse: Creating Materialized Views")
    if "daily_aggregated_spend_per_model_mv" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model_mv")
        client.command(
            """
            CREATE MATERIALIZED VIEW daily_aggregated_spend_per_model_mv
            TO daily_aggregated_spend_per_model
            AS
            SELECT
                toDate(startTime) as day,
                sumState(spend) AS DailySpend,
                model as model
            FROM spend_logs
            GROUP BY
                day, model
            """
        )
    if "daily_aggregated_spend_per_api_key_mv" not in table_names:
        verbose_logger.debug(
            "Clickhouse: Creating daily_aggregated_spend_per_api_key_mv"
        )
        client.command(
            """
            CREATE MATERIALIZED VIEW daily_aggregated_spend_per_api_key_mv
            TO daily_aggregated_spend_per_api_key
            AS
            SELECT
                toDate(startTime) as day,
                sumState(spend) AS DailySpend,
                api_key as api_key
            FROM spend_logs
            GROUP BY
                day, api_key
            """
        )
    if "daily_aggregated_spend_per_user_mv" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user_mv")
        client.command(
            """
            CREATE MATERIALIZED VIEW daily_aggregated_spend_per_user_mv
            TO daily_aggregated_spend_per_user
            AS
            SELECT
                toDate(startTime) as day,
                sumState(spend) AS DailySpend,
                user as user
            FROM spend_logs
            GROUP BY
                day, user
            """
        )
    if "daily_aggregated_spend_mv" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_mv")
        client.command(
            """
            CREATE MATERIALIZED VIEW daily_aggregated_spend_mv
            TO daily_aggregated_spend
            AS
            SELECT
                toDate(startTime) as day,
                sumState(spend) AS DailySpend
            FROM spend_logs
            GROUP BY
                day
            """
        )
 def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
    # Basic Logging works without this - this is only used for low latency reporting apis
    verbose_logger.debug("Clickhouse: Creating Aggregate Tables")
    # Create Aggregeate Tables if they don't exist
    if "daily_aggregated_spend_per_model" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model")
        client.command(
            """
            CREATE TABLE daily_aggregated_spend_per_model
            (
                `day` Date,
                `DailySpend` AggregateFunction(sum, Float64),
                `model` String
            )
            ENGINE = SummingMergeTree()
            ORDER BY (day, model);
            """
        )
    if "daily_aggregated_spend_per_api_key" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_api_key")
        client.command(
            """
            CREATE TABLE daily_aggregated_spend_per_api_key
            (
                `day` Date,
                `DailySpend` AggregateFunction(sum, Float64),
                `api_key` String
            )
            ENGINE = SummingMergeTree()
            ORDER BY (day, api_key);
            """
        )
    if "daily_aggregated_spend_per_user" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user")
        client.command(
            """
            CREATE TABLE daily_aggregated_spend_per_user
            (
                `day` Date,
                `DailySpend` AggregateFunction(sum, Float64),
                `user` String
            )
            ENGINE = SummingMergeTree()
            ORDER BY (day, user);
            """
        )
    if "daily_aggregated_spend" not in table_names:
        verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend")
        client.command(
            """
            CREATE TABLE daily_aggregated_spend
            (
                `day` Date,
                `DailySpend` AggregateFunction(sum, Float64),
            )
            ENGINE = SummingMergeTree()
            ORDER BY (day);
            """
        )
    return
--- a/litellm/init.py
+++ b/litellm/init.py
@ -549,6 +549,8 @@ from .utils import (
    token_counter,
    cost_per_token,
    completion_cost,
    supports_function_calling,
    supports_parallel_function_calling,
    get_litellm_params,
    Logging,
    acreate,
--- a/litellm/integrations/clickhouse.py
+++ b/litellm/integrations/clickhouse.py
@ -27,6 +27,151 @@ import litellm, uuid
 from litellm._logging import print_verbose, verbose_logger
 def create_client():
    try:
        import clickhouse_connect
        port = os.getenv("CLICKHOUSE_PORT")
        clickhouse_host = os.getenv("CLICKHOUSE_HOST")
        if clickhouse_host is not None:
            verbose_logger.debug("setting up clickhouse")
            if port is not None and isinstance(port, str):
                port = int(port)
            client = clickhouse_connect.get_client(
                host=os.getenv("CLICKHOUSE_HOST"),
                port=port,
                username=os.getenv("CLICKHOUSE_USERNAME"),
                password=os.getenv("CLICKHOUSE_PASSWORD"),
            )
            return client
        else:
            raise Exception("Clickhouse: Clickhouse host not set")
    except Exception as e:
        raise ValueError(f"Clickhouse: {e}")
 def build_daily_metrics():
    click_house_client = create_client()
    # get daily spend
    daily_spend = click_house_client.query_df(
        """
        SELECT sumMerge(DailySpend) as daily_spend, day FROM daily_aggregated_spend GROUP BY day
        """
    )
    # get daily spend per model
    daily_spend_per_model = click_house_client.query_df(
        """
        SELECT sumMerge(DailySpend) as daily_spend, day, model FROM daily_aggregated_spend_per_model GROUP BY day, model
        """
    )
    new_df = daily_spend_per_model.to_dict(orient="records")
    import pandas as pd
    df = pd.DataFrame(new_df)
    # Group by 'day' and create a dictionary for each group
    result_dict = {}
    for day, group in df.groupby("day"):
        models = group["model"].tolist()
        spend = group["daily_spend"].tolist()
        spend_per_model = {model: spend for model, spend in zip(models, spend)}
        result_dict[day] = spend_per_model
    # Display the resulting dictionary
    # get daily spend per API key
    daily_spend_per_api_key = click_house_client.query_df(
        """
            SELECT
                daily_spend,
                day,
                api_key
            FROM (
                SELECT
                    sumMerge(DailySpend) as daily_spend,
                    day,
                    api_key,
                    RANK() OVER (PARTITION BY day ORDER BY sumMerge(DailySpend) DESC) as spend_rank
                FROM
                    daily_aggregated_spend_per_api_key
                GROUP BY
                    day,
                    api_key
            ) AS ranked_api_keys
            WHERE
                spend_rank <= 5
                AND day IS NOT NULL
            ORDER BY
                day,
                daily_spend DESC
        """
    )
    new_df = daily_spend_per_api_key.to_dict(orient="records")
    import pandas as pd
    df = pd.DataFrame(new_df)
    # Group by 'day' and create a dictionary for each group
    api_key_result_dict = {}
    for day, group in df.groupby("day"):
        api_keys = group["api_key"].tolist()
        spend = group["daily_spend"].tolist()
        spend_per_api_key = {api_key: spend for api_key, spend in zip(api_keys, spend)}
        api_key_result_dict[day] = spend_per_api_key
    # Display the resulting dictionary
    # Calculate total spend across all days
    total_spend = daily_spend["daily_spend"].sum()
    # Identify top models and top API keys with the highest spend across all days
    top_models = {}
    top_api_keys = {}
    for day, spend_per_model in result_dict.items():
        for model, model_spend in spend_per_model.items():
            if model not in top_models or model_spend > top_models[model]:
                top_models[model] = model_spend
    for day, spend_per_api_key in api_key_result_dict.items():
        for api_key, api_key_spend in spend_per_api_key.items():
            if api_key not in top_api_keys or api_key_spend > top_api_keys[api_key]:
                top_api_keys[api_key] = api_key_spend
    # for each day in daily spend, look up the day in result_dict and api_key_result_dict
    # Assuming daily_spend DataFrame has 'day' column
    result = []
    for index, row in daily_spend.iterrows():
        day = row["day"]
        data_day = row.to_dict()
        # Look up in result_dict
        if day in result_dict:
            spend_per_model = result_dict[day]
            # Assuming there is a column named 'model' in daily_spend
            data_day["spend_per_model"] = spend_per_model  # Assign 0 if model not found
        # Look up in api_key_result_dict
        if day in api_key_result_dict:
            spend_per_api_key = api_key_result_dict[day]
            # Assuming there is a column named 'api_key' in daily_spend
            data_day["spend_per_api_key"] = spend_per_api_key
        result.append(data_day)
    data_to_return = {}
    data_to_return["daily_spend"] = result
    data_to_return["total_spend"] = total_spend
    data_to_return["top_models"] = top_models
    data_to_return["top_api_keys"] = top_api_keys
    return data_to_return
 # build_daily_metrics()
 def _start_clickhouse():
    import clickhouse_connect
@ -86,6 +231,14 @@ def _start_clickhouse():
            response = client.query("DESCRIBE default.spend_logs")
            verbose_logger.debug(f"spend logs schema ={response.result_rows}")
        # RUN Enterprise Clickhouse Setup
        # TLDR: For Enterprise - we create views / aggregate tables for low latency reporting APIs
        from litellm.proxy.enterprise.utils import _create_clickhouse_aggregate_tables
        from litellm.proxy.enterprise.utils import _create_clickhouse_material_views
        _create_clickhouse_aggregate_tables(client=client, table_names=table_names)
        _create_clickhouse_material_views(client=client, table_names=table_names)
 class ClickhouseLogger:
    # Class variables or attributes
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@ -278,7 +278,11 @@ def completion(
        import google.auth
        ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
        print_verbose(
            f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
        )
        creds, _ = google.auth.default(quota_project_id=vertex_project)
        print_verbose(f"VERTEX AI: creds={creds}")
        vertexai.init(
            project=vertex_project, location=vertex_location, credentials=creds
        )
--- a/litellm/main.py
+++ b/litellm/main.py
@ -1467,12 +1467,14 @@ def completion(
            response = model_response
        elif custom_llm_provider == "vertex_ai":
            vertex_ai_project = (
-                optional_params.pop("vertex_ai_project", None)
+                optional_params.pop("vertex_project", None)
                or optional_params.pop("vertex_ai_project", None)
                or litellm.vertex_project
                or get_secret("VERTEXAI_PROJECT")
            )
            vertex_ai_location = (
-                optional_params.pop("vertex_ai_location", None)
+                optional_params.pop("vertex_location", None)
                or optional_params.pop("vertex_ai_location", None)
                or litellm.vertex_location
                or get_secret("VERTEXAI_LOCATION")
            )
@ -2566,12 +2568,14 @@ def embedding(
            )
        elif custom_llm_provider == "vertex_ai":
            vertex_ai_project = (
-                optional_params.pop("vertex_ai_project", None)
+                optional_params.pop("vertex_project", None)
                or optional_params.pop("vertex_ai_project", None)
                or litellm.vertex_project
                or get_secret("VERTEXAI_PROJECT")
            )
            vertex_ai_location = (
-                optional_params.pop("vertex_ai_location", None)
+                optional_params.pop("vertex_location", None)
                or optional_params.pop("vertex_ai_location", None)
                or litellm.vertex_location
                or get_secret("VERTEXAI_LOCATION")
            )
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@ -6,7 +6,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-4-turbo-preview": {
        "max_tokens": 8192, 
@ -15,7 +16,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-0314": {
        "max_tokens": 8192,
@ -33,7 +36,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-4-32k": {
        "max_tokens": 32768,
@ -69,7 +73,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-0125-preview": {
        "max_tokens": 128000,
@ -78,7 +84,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-vision-preview": {
        "max_tokens": 128000,
@ -105,7 +113,8 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4097,
@ -123,7 +132,8 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-3.5-turbo-1106": {
        "max_tokens": 16385,
@ -132,7 +142,9 @@
        "input_cost_per_token": 0.0000010,
        "output_cost_per_token": 0.0000020,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-3.5-turbo-0125": {
        "max_tokens": 16385,
@ -141,7 +153,9 @@
        "input_cost_per_token": 0.0000005,
        "output_cost_per_token": 0.0000015,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16385,
@ -286,7 +300,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-4-1106-preview": {
        "max_tokens": 128000,
@ -295,7 +311,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-4-0613": {
        "max_tokens": 8192,
@ -304,7 +322,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/gpt-4-32k-0613": {
        "max_tokens": 32768,
@ -331,7 +350,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/gpt-4-turbo": {
        "max_tokens": 128000,
@ -340,7 +360,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure", 
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-4-turbo-vision-preview": {
        "max_tokens": 128000,
@ -358,7 +380,8 @@
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/gpt-35-turbo-1106": {
        "max_tokens": 16384,
@ -367,7 +390,20 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-35-turbo-0125": {
        "max_tokens": 16384,
        "max_input_tokens": 16384,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.0000005,
        "output_cost_per_token": 0.0000015,
        "litellm_provider": "azure",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-35-turbo-16k": {
        "max_tokens": 16385,
@ -385,7 +421,8 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/ada": {
        "max_tokens": 8191,
@ -514,11 +551,12 @@
        "mode": "chat"
    },
    "mistral/mistral-large-latest": {
-        "max_tokens": 8192,
+        "max_tokens": 32000,
        "input_cost_per_token": 0.000008,
        "output_cost_per_token": 0.000024,
        "litellm_provider": "mistral",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "mistral/mistral-embed": {
        "max_tokens": 8192,
@ -676,7 +714,8 @@
        "input_cost_per_token": 0.00000025, 
        "output_cost_per_token": 0.0000005,
        "litellm_provider": "vertex_ai-language-models",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gemini-1.5-pro": { 
        "max_tokens": 8192,
@ -687,6 +726,15 @@
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat"
    },
    "gemini-1.5-pro-preview-0215": { 
        "max_tokens": 8192,
        "max_input_tokens": 1000000,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0, 
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat"
    },
    "gemini-pro-vision": {
        "max_tokens": 16384,
        "max_output_tokens": 2048,
@ -1729,6 +1777,23 @@
        "output_cost_per_token": 0.0000009,
        "litellm_provider": "together_ai"
    },
    "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
        "input_cost_per_token": 0.0000006,
        "output_cost_per_token": 0.0000006,
        "litellm_provider": "together_ai",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
        "litellm_provider": "together_ai",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "together_ai/togethercomputer/CodeLlama-34b-Instruct": {
        "litellm_provider": "together_ai",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "ollama/llama2": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.0,
@ -1981,7 +2046,16 @@
        "input_cost_per_token": 0.00000015, 
        "output_cost_per_token": 0.00000015,
        "litellm_provider": "anyscale", 
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
      },
      "anyscale/Mixtral-8x7B-Instruct-v0.1": {
        "max_tokens": 16384, 
        "input_cost_per_token": 0.00000015, 
        "output_cost_per_token": 0.00000015,
        "litellm_provider": "anyscale", 
        "mode": "chat",
        "supports_function_calling": true
      },
      "anyscale/HuggingFaceH4/zephyr-7b-beta": {
        "max_tokens": 16384, 
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-cc9d300e3b13fc1b.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-cc9d300e3b13fc1b.js
--- a/litellm/proxy/_experimental/out/_next/static/h6IXdBMiZG7ES547qg1M-/_buildManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/h6IXdBMiZG7ES547qg1M-/_buildManifest.js
@ -0,0 +1 @@
 self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-d6107f1aac0c574c.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
--- a/litellm/proxy/_experimental/out/_next/static/h6IXdBMiZG7ES547qg1M-/_ssgManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/h6IXdBMiZG7ES547qg1M-/_ssgManifest.js
@ -0,0 +1 @@
 self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()
--- a/litellm/proxy/custom_callbacks.py
+++ b/litellm/proxy/custom_callbacks.py
@ -0,0 +1,66 @@
 from litellm.integrations.custom_logger import CustomLogger
 import litellm
 # This file includes the custom callbacks for LiteLLM Proxy
 # Once defined, these can be passed in proxy_config.yaml
 class MyCustomHandler(CustomLogger):
    def log_pre_api_call(self, model, messages, kwargs):
        print(f"Pre-API Call")  # noqa
    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
        print(f"Post-API Call")  # noqa
    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
        print(f"On Stream")  # noqa
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("On Success")  # noqa
    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print(f"On Failure")  # noqa
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        print(f"ishaan async_log_success_event")  # noqa
        # log: key, user, model, prompt, response, tokens, cost
        # Access kwargs passed to litellm.completion()
        model = kwargs.get("model", None)
        messages = kwargs.get("messages", None)
        user = kwargs.get("user", None)
        # Access litellm_params passed to litellm.completion(), example access `metadata`
        litellm_params = kwargs.get("litellm_params", {})
        metadata = litellm_params.get(
            "metadata", {}
        )  # headers passed to LiteLLM proxy, can be found here
        return
    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        try:
            print(f"On Async Failure !")  # noqa
            print("\nkwargs", kwargs)  # noqa
            # Access kwargs passed to litellm.completion()
            model = kwargs.get("model", None)
            messages = kwargs.get("messages", None)
            user = kwargs.get("user", None)
            # Access litellm_params passed to litellm.completion(), example access `metadata`
            litellm_params = kwargs.get("litellm_params", {})
            metadata = litellm_params.get(
                "metadata", {}
            )  # headers passed to LiteLLM proxy, can be found here
            # Acess Exceptions & Traceback
            exception_event = kwargs.get("exception", None)
            traceback_event = kwargs.get("traceback_exception", None)
            # Calculate cost using  litellm.completion_cost()
        except Exception as e:
            print(f"Exception: {e}")  # noqa
 proxy_handler_instance = MyCustomHandler()
 # Set litellm.callbacks = [proxy_handler_instance] on the proxy
 # need to set litellm.callbacks = [proxy_handler_instance] # on the proxy
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -45,7 +45,7 @@ litellm_settings:
  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
  success_callback: ['langfuse']
  # setting callback class
-  # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
+  callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
 general_settings: 
  master_key: sk-1234
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -240,6 +240,8 @@ health_check_results = {}
 queue: List = []
 litellm_proxy_budget_name = "litellm-proxy-budget"
 ui_access_mode: Literal["admin", "all"] = "all"
 proxy_budget_rescheduler_min_time = 597
 proxy_budget_rescheduler_max_time = 605
 ### INITIALIZE GLOBAL LOGGING OBJECT ###
 proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
 ### REDIS QUEUE ###
@ -1407,7 +1409,7 @@ class ProxyConfig:
        """
        Load config values into proxy global state
        """
-        global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, user_custom_key_generate, use_background_health_checks, health_check_interval, use_queue, custom_db_client, ui_access_mode
+        global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, user_custom_key_generate, use_background_health_checks, health_check_interval, use_queue, custom_db_client, proxy_budget_rescheduler_max_time, proxy_budget_rescheduler_min_time, ui_access_mode
        # Load existing config
        config = await self.get_config(config_file_path=config_file_path)
@ -1718,6 +1720,13 @@ class ProxyConfig:
            ui_access_mode = general_settings.get(
                "ui_access_mode", "all"
            )  # can be either ["admin_only" or "all"]
            ## BUDGET RESCHEDULER ##
            proxy_budget_rescheduler_min_time = general_settings.get(
                "proxy_budget_rescheduler_min_time", proxy_budget_rescheduler_min_time
            )
            proxy_budget_rescheduler_max_time = general_settings.get(
                "proxy_budget_rescheduler_max_time", proxy_budget_rescheduler_max_time
            )
            ### BACKGROUND HEALTH CHECKS ###
            # Enable background health checks
            use_background_health_checks = general_settings.get(
@ -2120,10 +2129,9 @@ async def async_data_generator(response, user_api_key_dict):
    try:
        start_time = time.time()
        async for chunk in response:
-            verbose_proxy_logger.debug(f"returned chunk: {chunk}")
+            chunk = chunk.model_dump_json(exclude_none=True)
            assert isinstance(chunk, litellm.ModelResponse)
            try:
-                yield f"data: {json.dumps(chunk.model_dump(exclude_none=True))}\n\n"
+                yield f"data: {chunk}\n\n"
            except Exception as e:
                yield f"data: {str(e)}\n\n"
@ -2202,7 +2210,7 @@ def parse_cache_control(cache_control):
@router.on_event("startup")
 async def startup_event():
-    global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings
+    global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings, proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time
    import json
    ### LOAD MASTER KEY ###
@ -2307,13 +2315,12 @@ async def startup_event():
    ### CHECK IF VIEW EXISTS ###
    if prisma_client is not None:
        create_view_response = await prisma_client.check_view_exists()
        print(f"create_view_response: {create_view_response}")  # noqa
    ### START BUDGET SCHEDULER ###
    if prisma_client is not None:
        scheduler = AsyncIOScheduler()
        interval = random.randint(
-            597, 605
+            proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time
        )  # random interval, so multiple workers avoid resetting budget at the same time
        scheduler.add_job(
            reset_budget, "interval", seconds=interval, args=[prisma_client]
@ -3780,7 +3787,7 @@ async def view_spend_tags(
@router.get(
    "/spend/logs",
-    tags=["budget & spend Tracking"],
+    tags=["Budget & Spend Tracking"],
    dependencies=[Depends(user_api_key_auth)],
    responses={
        200: {"model": List[LiteLLM_SpendLogs]},
@ -3839,13 +3846,55 @@ async def view_spend_logs(
        # gettting spend logs from clickhouse
        from litellm.proxy.enterprise.utils import view_spend_logs_from_clickhouse
-        return await view_spend_logs_from_clickhouse(
+        daily_metrics = await view_daily_metrics(
            api_key=api_key,
            user_id=user_id,
            request_id=request_id,
            start_date=start_date,
            end_date=end_date,
        )
        # get the top api keys across all daily_metrics
        top_api_keys = {}  # type: ignore
        # make this compatible with the admin UI
        for response in daily_metrics.get("daily_spend", {}):
            response["startTime"] = response["day"]
            response["spend"] = response["daily_spend"]
            response["models"] = response["spend_per_model"]
            response["users"] = {"ishaan": 0.0}
            spend_per_api_key = response["spend_per_api_key"]
            # insert spend_per_api_key key, values in response
            for key, value in spend_per_api_key.items():
                response[key] = value
                top_api_keys[key] = top_api_keys.get(key, 0.0) + value
            del response["day"]
            del response["daily_spend"]
            del response["spend_per_model"]
            del response["spend_per_api_key"]
        # get top 5 api keys
        top_api_keys = sorted(top_api_keys.items(), key=lambda x: x[1], reverse=True)  # type: ignore
        top_api_keys = top_api_keys[:5]  # type: ignore
        top_api_keys = dict(top_api_keys)  # type: ignore
        """
        set it like this 
        {
            "key" : key,
            "spend:" : spend
        }
        """
        # we need this to show on the Admin UI
        response_keys = []
        for key in top_api_keys.items():
            response_keys.append(
                {
                    "key": key[0],
                    "spend": key[1],
                }
            )
        daily_metrics["top_api_keys"] = response_keys
        return daily_metrics
    global prisma_client
    try:
        verbose_proxy_logger.debug("inside view_spend_logs")
@ -3998,6 +4047,142 @@ async def view_spend_logs(
        )
@router.get(
    "/global/spend/logs",
    tags=["Budget & Spend Tracking"],
    dependencies=[Depends(user_api_key_auth)],
 )
 async def global_spend_logs():
    """
    [BETA] This is a beta endpoint. It will change.
    Use this to get global spend (spend per day for last 30d). Admin-only endpoint
    More efficient implementation of /spend/logs, by creating a view over the spend logs table.
    """
    global prisma_client
    sql_query = """SELECT * FROM "MonthlyGlobalSpend";"""
    response = await prisma_client.db.query_raw(query=sql_query)
    return response
@router.get(
    "/global/spend/keys",
    tags=["Budget & Spend Tracking"],
    dependencies=[Depends(user_api_key_auth)],
 )
 async def global_spend_keys(
    limit: int = fastapi.Query(
        default=None,
        description="Number of keys to get. Will return Top 'n' keys.",
    )
 ):
    """
    [BETA] This is a beta endpoint. It will change.
    Use this to get the top 'n' keys with the highest spend, ordered by spend.
    """
    global prisma_client
    if prisma_client is None:
        raise HTTPException(status_code=500, detail={"error": "No db connected"})
    sql_query = f"""SELECT * FROM "Last30dKeysBySpend" LIMIT {limit};"""
    response = await prisma_client.db.query_raw(query=sql_query)
    return response
@router.get(
    "/global/spend/models",
    tags=["Budget & Spend Tracking"],
    dependencies=[Depends(user_api_key_auth)],
 )
 async def global_spend_models(
    limit: int = fastapi.Query(
        default=None,
        description="Number of models to get. Will return Top 'n' models.",
    )
 ):
    """
    [BETA] This is a beta endpoint. It will change.
    Use this to get the top 'n' keys with the highest spend, ordered by spend.
    """
    global prisma_client
    if prisma_client is None:
        raise HTTPException(status_code=500, detail={"error": "No db connected"})
    sql_query = f"""SELECT * FROM "Last30dModelsBySpend" LIMIT {limit};"""
    response = await prisma_client.db.query_raw(query=sql_query)
    return response
@router.get(
    "/daily_metrics",
    summary="Get daily spend metrics",
    tags=["budget & spend Tracking"],
    dependencies=[Depends(user_api_key_auth)],
 )
 async def view_daily_metrics(
    start_date: Optional[str] = fastapi.Query(
        default=None,
        description="Time from which to start viewing key spend",
    ),
    end_date: Optional[str] = fastapi.Query(
        default=None,
        description="Time till which to view key spend",
    ),
 ):
    """
    [BETA] This is a beta endpoint. It might change without notice.
    Please give feedback - https://github.com/BerriAI/litellm/issues
    """
    try:
        if os.getenv("CLICKHOUSE_HOST") is not None:
            # gettting spend logs from clickhouse
            from litellm.integrations import clickhouse
            return clickhouse.build_daily_metrics()
            # create a response object
            """
            {
                "date": "2022-01-01",
                "spend": 0.0,
                "users": {},
                "models": {},
            }
            """
        else:
            raise Exception(
                "Clickhouse: Clickhouse host not set. Required for viewing /daily/metrics"
            )
    except Exception as e:
        if isinstance(e, HTTPException):
            raise ProxyException(
                message=getattr(e, "detail", f"/spend/logs Error({str(e)})"),
                type="internal_error",
                param=getattr(e, "param", "None"),
                code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
            )
        elif isinstance(e, ProxyException):
            raise e
        raise ProxyException(
            message="/spend/logs Error" + str(e),
            type="internal_error",
            param=getattr(e, "param", "None"),
            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
 #### USER MANAGEMENT ####
@router.post(
    "/user/new",
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -489,18 +489,20 @@ class PrismaClient:
    )
    async def check_view_exists(self):
        """
-        Checks if the LiteLLM_VerificationTokenView exists in the user's db.
+        Checks if the LiteLLM_VerificationTokenView and MonthlyGlobalSpend exists in the user's db.
-        This is used for getting the token + team data in user_api_key_auth
+        LiteLLM_VerificationTokenView: This view is used for getting the token + team data in user_api_key_auth
        MonthlyGlobalSpend: This view is used for the admin view to see global spend for this month
        If the view doesn't exist, one will be created.
        """
        try:
            # Try to select one row from the view
-            await self.db.execute_raw(
+            await self.db.query_raw(
                """SELECT 1 FROM "LiteLLM_VerificationTokenView" LIMIT 1"""
            )
-            return "LiteLLM_VerificationTokenView Exists!"
+            print("LiteLLM_VerificationTokenView Exists!")  # noqa
        except Exception as e:
            # If an error occurs, the view does not exist, so create it
            value = await self.health_check()
@ -518,7 +520,29 @@ class PrismaClient:
                """
            )
-        return "LiteLLM_VerificationTokenView Created!"
+            print("LiteLLM_VerificationTokenView Created!")  # noqa
        try:
            await self.db.query_raw("""SELECT 1 FROM "MonthlyGlobalSpend" LIMIT 1""")
            print("MonthlyGlobalSpend Exists!")  # noqa
        except Exception as e:
            sql_query = """
            CREATE OR REPLACE VIEW "MonthlyGlobalSpend" AS 
            SELECT
            DATE("startTime") AS date, 
            SUM("spend") AS spend 
            FROM 
            "LiteLLM_SpendLogs" 
            WHERE 
            "startTime" >= (CURRENT_DATE - INTERVAL '30 days')
            GROUP BY 
            DATE("startTime");
            """
            await self.db.execute_raw(query=sql_query)
            print("MonthlyGlobalSpend Created!")  # noqa
        return
    @backoff.on_exception(
        backoff.expo,
--- a/litellm/tests/test_amazing_s3_logs.py
+++ b/litellm/tests/test_amazing_s3_logs.py
@ -1,253 +1,254 @@
-import sys
+## @pytest.mark.skip(reason="AWS Suspended Account")
-import os
+# import sys
-import io, asyncio
+# import os
 # import io, asyncio
-# import logging
+# # import logging
-# logging.basicConfig(level=logging.DEBUG)
+# # logging.basicConfig(level=logging.DEBUG)
-sys.path.insert(0, os.path.abspath("../.."))
+# sys.path.insert(0, os.path.abspath("../.."))
-from litellm import completion
+# from litellm import completion
-import litellm
+# import litellm
-litellm.num_retries = 3
+# litellm.num_retries = 3
-import time, random
+# import time, random
-import pytest
+# import pytest
-def test_s3_logging():
+# def test_s3_logging():
-    # all s3 requests need to be in one test function
+#     # all s3 requests need to be in one test function
-    # since we are modifying stdout, and pytests runs tests in parallel
+#     # since we are modifying stdout, and pytests runs tests in parallel
-    # on circle ci - we only test litellm.acompletion()
+#     # on circle ci - we only test litellm.acompletion()
-    try:
+#     try:
-        # redirect stdout to log_file
+#         # redirect stdout to log_file
        litellm.cache = litellm.Cache(
            type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2"
        )
        litellm.success_callback = ["s3"]
        litellm.s3_callback_params = {
            "s3_bucket_name": "litellm-logs",
            "s3_aws_secret_access_key": "os.environ/AWS_SECRET_ACCESS_KEY",
            "s3_aws_access_key_id": "os.environ/AWS_ACCESS_KEY_ID",
        }
        litellm.set_verbose = True
        print("Testing async s3 logging")
        expected_keys = []
        import time
        curr_time = str(time.time())
        async def _test():
            return await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
                max_tokens=10,
                temperature=0.7,
                user="ishaan-2",
            )
        response = asyncio.run(_test())
        print(f"response: {response}")
        expected_keys.append(response.id)
        async def _test():
            return await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
                max_tokens=10,
                temperature=0.7,
                user="ishaan-2",
            )
        response = asyncio.run(_test())
        expected_keys.append(response.id)
        print(f"response: {response}")
        time.sleep(5)  # wait 5s for logs to land
        import boto3
        s3 = boto3.client("s3")
        bucket_name = "litellm-logs"
        # List objects in the bucket
        response = s3.list_objects(Bucket=bucket_name)
        # Sort the objects based on the LastModified timestamp
        objects = sorted(
            response["Contents"], key=lambda x: x["LastModified"], reverse=True
        )
        # Get the keys of the most recent objects
        most_recent_keys = [obj["Key"] for obj in objects]
        print(most_recent_keys)
        # for each key, get the part before "-" as the key. Do it safely
        cleaned_keys = []
        for key in most_recent_keys:
            split_key = key.split("_")
            if len(split_key) < 2:
                continue
            cleaned_keys.append(split_key[1])
        print("\n most recent keys", most_recent_keys)
        print("\n cleaned keys", cleaned_keys)
        print("\n Expected keys: ", expected_keys)
        matches = 0
        for key in expected_keys:
            key += ".json"
            assert key in cleaned_keys
            if key in cleaned_keys:
                matches += 1
                # remove the match key
                cleaned_keys.remove(key)
        # this asserts we log, the first request + the 2nd cached request
        print("we had two matches ! passed ", matches)
        assert matches == 2
        try:
            # cleanup s3 bucket in test
            for key in most_recent_keys:
                s3.delete_object(Bucket=bucket_name, Key=key)
        except:
            # don't let cleanup fail a test
            pass
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")
    finally:
        # post, close log file and verify
        # Reset stdout to the original value
        print("Passed! Testing async s3 logging")
 # test_s3_logging()
 def test_s3_logging_async():
    # this tests time added to make s3 logging calls, vs just acompletion calls
    try:
        litellm.set_verbose = True
        # Make 5 calls with an empty success_callback
        litellm.success_callback = []
        start_time_empty_callback = asyncio.run(make_async_calls())
        print("done with no callback test")
        print("starting s3 logging load test")
        # Make 5 calls with success_callback set to "langfuse"
        litellm.success_callback = ["s3"]
        litellm.s3_callback_params = {
            "s3_bucket_name": "litellm-logs",
            "s3_aws_secret_access_key": "os.environ/AWS_SECRET_ACCESS_KEY",
            "s3_aws_access_key_id": "os.environ/AWS_ACCESS_KEY_ID",
        }
        start_time_s3 = asyncio.run(make_async_calls())
        print("done with s3 test")
        # Compare the time for both scenarios
        print(f"Time taken with success_callback='s3': {start_time_s3}")
        print(f"Time taken with empty success_callback: {start_time_empty_callback}")
        # assert the diff is not more than 1 second
        assert abs(start_time_s3 - start_time_empty_callback) < 1
    except litellm.Timeout as e:
        pass
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")
 async def make_async_calls():
    tasks = []
    for _ in range(5):
        task = asyncio.create_task(
            litellm.acompletion(
                model="azure/chatgpt-v-2",
                messages=[{"role": "user", "content": "This is a test"}],
                max_tokens=5,
                temperature=0.7,
                timeout=5,
                user="langfuse_latency_test_user",
                mock_response="It's simple to use and easy to get started",
            )
        )
        tasks.append(task)
    # Measure the start time before running the tasks
    start_time = asyncio.get_event_loop().time()
    # Wait for all tasks to complete
    responses = await asyncio.gather(*tasks)
    # Print the responses when tasks return
    for idx, response in enumerate(responses):
        print(f"Response from Task {idx + 1}: {response}")
    # Calculate the total time taken
    total_time = asyncio.get_event_loop().time() - start_time
    return total_time
 def test_s3_logging_r2():
    # all s3 requests need to be in one test function
    # since we are modifying stdout, and pytests runs tests in parallel
    # on circle ci - we only test litellm.acompletion()
    try:
        # redirect stdout to log_file
 #         litellm.cache = litellm.Cache(
-        #     type="s3", s3_bucket_name="litellm-r2-bucket", s3_region_name="us-west-2"
+#             type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2"
 #         )
        litellm.set_verbose = True
        from litellm._logging import verbose_logger
        import logging
-        verbose_logger.setLevel(level=logging.DEBUG)
+#         litellm.success_callback = ["s3"]
 #         litellm.s3_callback_params = {
 #             "s3_bucket_name": "litellm-logs",
 #             "s3_aws_secret_access_key": "os.environ/AWS_SECRET_ACCESS_KEY",
 #             "s3_aws_access_key_id": "os.environ/AWS_ACCESS_KEY_ID",
 #         }
 #         litellm.set_verbose = True
-        litellm.success_callback = ["s3"]
+#         print("Testing async s3 logging")
        litellm.s3_callback_params = {
            "s3_bucket_name": "litellm-r2-bucket",
            "s3_aws_secret_access_key": "os.environ/R2_S3_ACCESS_KEY",
            "s3_aws_access_key_id": "os.environ/R2_S3_ACCESS_ID",
            "s3_endpoint_url": "os.environ/R2_S3_URL",
            "s3_region_name": "os.environ/R2_S3_REGION_NAME",
        }
        print("Testing async s3 logging")
-        expected_keys = []
+#         expected_keys = []
-        import time
+#         import time
-        curr_time = str(time.time())
+#         curr_time = str(time.time())
-        async def _test():
+#         async def _test():
-            return await litellm.acompletion(
+#             return await litellm.acompletion(
-                model="gpt-3.5-turbo",
+#                 model="gpt-3.5-turbo",
-                messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
+#                 messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
-                max_tokens=10,
+#                 max_tokens=10,
-                temperature=0.7,
+#                 temperature=0.7,
-                user="ishaan-2",
+#                 user="ishaan-2",
-            )
+#             )
-        response = asyncio.run(_test())
+#         response = asyncio.run(_test())
-        print(f"response: {response}")
+#         print(f"response: {response}")
-        expected_keys.append(response.id)
+#         expected_keys.append(response.id)
-        import boto3
+#         async def _test():
 #             return await litellm.acompletion(
 #                 model="gpt-3.5-turbo",
 #                 messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
 #                 max_tokens=10,
 #                 temperature=0.7,
 #                 user="ishaan-2",
 #             )
-        s3 = boto3.client(
+#         response = asyncio.run(_test())
-            "s3",
+#         expected_keys.append(response.id)
-            endpoint_url=os.getenv("R2_S3_URL"),
+#         print(f"response: {response}")
-            region_name=os.getenv("R2_S3_REGION_NAME"),
+#         time.sleep(5)  # wait 5s for logs to land
            aws_access_key_id=os.getenv("R2_S3_ACCESS_ID"),
            aws_secret_access_key=os.getenv("R2_S3_ACCESS_KEY"),
        )
-        bucket_name = "litellm-r2-bucket"
+#         import boto3
        # List objects in the bucket
        response = s3.list_objects(Bucket=bucket_name)
-    except Exception as e:
+#         s3 = boto3.client("s3")
-        pytest.fail(f"An exception occurred - {e}")
+#         bucket_name = "litellm-logs"
-    finally:
+#         # List objects in the bucket
-        # post, close log file and verify
+#         response = s3.list_objects(Bucket=bucket_name)
-        # Reset stdout to the original value
+
-        print("Passed! Testing async s3 logging")
+#         # Sort the objects based on the LastModified timestamp
 #         objects = sorted(
 #             response["Contents"], key=lambda x: x["LastModified"], reverse=True
 #         )
 #         # Get the keys of the most recent objects
 #         most_recent_keys = [obj["Key"] for obj in objects]
 #         print(most_recent_keys)
 #         # for each key, get the part before "-" as the key. Do it safely
 #         cleaned_keys = []
 #         for key in most_recent_keys:
 #             split_key = key.split("_")
 #             if len(split_key) < 2:
 #                 continue
 #             cleaned_keys.append(split_key[1])
 #         print("\n most recent keys", most_recent_keys)
 #         print("\n cleaned keys", cleaned_keys)
 #         print("\n Expected keys: ", expected_keys)
 #         matches = 0
 #         for key in expected_keys:
 #             key += ".json"
 #             assert key in cleaned_keys
 #             if key in cleaned_keys:
 #                 matches += 1
 #                 # remove the match key
 #                 cleaned_keys.remove(key)
 #         # this asserts we log, the first request + the 2nd cached request
 #         print("we had two matches ! passed ", matches)
 #         assert matches == 2
 #         try:
 #             # cleanup s3 bucket in test
 #             for key in most_recent_keys:
 #                 s3.delete_object(Bucket=bucket_name, Key=key)
 #         except:
 #             # don't let cleanup fail a test
 #             pass
 #     except Exception as e:
 #         pytest.fail(f"An exception occurred - {e}")
 #     finally:
 #         # post, close log file and verify
 #         # Reset stdout to the original value
 #         print("Passed! Testing async s3 logging")
 # # test_s3_logging()
 # def test_s3_logging_async():
 #     # this tests time added to make s3 logging calls, vs just acompletion calls
 #     try:
 #         litellm.set_verbose = True
 #         # Make 5 calls with an empty success_callback
 #         litellm.success_callback = []
 #         start_time_empty_callback = asyncio.run(make_async_calls())
 #         print("done with no callback test")
 #         print("starting s3 logging load test")
 #         # Make 5 calls with success_callback set to "langfuse"
 #         litellm.success_callback = ["s3"]
 #         litellm.s3_callback_params = {
 #             "s3_bucket_name": "litellm-logs",
 #             "s3_aws_secret_access_key": "os.environ/AWS_SECRET_ACCESS_KEY",
 #             "s3_aws_access_key_id": "os.environ/AWS_ACCESS_KEY_ID",
 #         }
 #         start_time_s3 = asyncio.run(make_async_calls())
 #         print("done with s3 test")
 #         # Compare the time for both scenarios
 #         print(f"Time taken with success_callback='s3': {start_time_s3}")
 #         print(f"Time taken with empty success_callback: {start_time_empty_callback}")
 #         # assert the diff is not more than 1 second
 #         assert abs(start_time_s3 - start_time_empty_callback) < 1
 #     except litellm.Timeout as e:
 #         pass
 #     except Exception as e:
 #         pytest.fail(f"An exception occurred - {e}")
 # async def make_async_calls():
 #     tasks = []
 #     for _ in range(5):
 #         task = asyncio.create_task(
 #             litellm.acompletion(
 #                 model="azure/chatgpt-v-2",
 #                 messages=[{"role": "user", "content": "This is a test"}],
 #                 max_tokens=5,
 #                 temperature=0.7,
 #                 timeout=5,
 #                 user="langfuse_latency_test_user",
 #                 mock_response="It's simple to use and easy to get started",
 #             )
 #         )
 #         tasks.append(task)
 #     # Measure the start time before running the tasks
 #     start_time = asyncio.get_event_loop().time()
 #     # Wait for all tasks to complete
 #     responses = await asyncio.gather(*tasks)
 #     # Print the responses when tasks return
 #     for idx, response in enumerate(responses):
 #         print(f"Response from Task {idx + 1}: {response}")
 #     # Calculate the total time taken
 #     total_time = asyncio.get_event_loop().time() - start_time
 #     return total_time
 # def test_s3_logging_r2():
 #     # all s3 requests need to be in one test function
 #     # since we are modifying stdout, and pytests runs tests in parallel
 #     # on circle ci - we only test litellm.acompletion()
 #     try:
 #         # redirect stdout to log_file
 #         # litellm.cache = litellm.Cache(
 #         #     type="s3", s3_bucket_name="litellm-r2-bucket", s3_region_name="us-west-2"
 #         # )
 #         litellm.set_verbose = True
 #         from litellm._logging import verbose_logger
 #         import logging
 #         verbose_logger.setLevel(level=logging.DEBUG)
 #         litellm.success_callback = ["s3"]
 #         litellm.s3_callback_params = {
 #             "s3_bucket_name": "litellm-r2-bucket",
 #             "s3_aws_secret_access_key": "os.environ/R2_S3_ACCESS_KEY",
 #             "s3_aws_access_key_id": "os.environ/R2_S3_ACCESS_ID",
 #             "s3_endpoint_url": "os.environ/R2_S3_URL",
 #             "s3_region_name": "os.environ/R2_S3_REGION_NAME",
 #         }
 #         print("Testing async s3 logging")
 #         expected_keys = []
 #         import time
 #         curr_time = str(time.time())
 #         async def _test():
 #             return await litellm.acompletion(
 #                 model="gpt-3.5-turbo",
 #                 messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
 #                 max_tokens=10,
 #                 temperature=0.7,
 #                 user="ishaan-2",
 #             )
 #         response = asyncio.run(_test())
 #         print(f"response: {response}")
 #         expected_keys.append(response.id)
 #         import boto3
 #         s3 = boto3.client(
 #             "s3",
 #             endpoint_url=os.getenv("R2_S3_URL"),
 #             region_name=os.getenv("R2_S3_REGION_NAME"),
 #             aws_access_key_id=os.getenv("R2_S3_ACCESS_ID"),
 #             aws_secret_access_key=os.getenv("R2_S3_ACCESS_KEY"),
 #         )
 #         bucket_name = "litellm-r2-bucket"
 #         # List objects in the bucket
 #         response = s3.list_objects(Bucket=bucket_name)
 #     except Exception as e:
 #         pytest.fail(f"An exception occurred - {e}")
 #     finally:
 #         # post, close log file and verify
 #         # Reset stdout to the original value
 #         print("Passed! Testing async s3 logging")
--- a/litellm/tests/test_amazing_vertex_completion.py
+++ b/litellm/tests/test_amazing_vertex_completion.py
@ -130,6 +130,8 @@ def test_vertex_ai():
                f"response.choices[0].finish_reason: {response.choices[0].finish_reason}"
            )
            assert response.choices[0].finish_reason in litellm._openai_finish_reasons
        except litellm.RateLimitError as e:
            pass
        except Exception as e:
            pytest.fail(f"Error occurred: {e}")
@ -183,6 +185,8 @@ def test_vertex_ai_stream():
                assert type(content) == str
                # pass
            assert len(completed_str) > 4
        except litellm.RateLimitError as e:
            pass
        except Exception as e:
            pytest.fail(f"Error occurred: {e}")
--- a/litellm/tests/test_async_fn.py
+++ b/litellm/tests/test_async_fn.py
@ -193,16 +193,26 @@ async def test_hf_completion_tgi():
        # Add any assertions here to check the response
        print(response)
    except litellm.APIError as e:
        print("got an api error")
        pass
    except litellm.Timeout as e:
        print("got a timeout error")
        pass
    except litellm.RateLimitError as e:
        # this will catch the model is overloaded error
        print("got a rate limit error")
        pass
    except Exception as e:
        if "Model is overloaded" in str(e):
            pass
        else:
            pytest.fail(f"Error occurred: {e}")
 # test_get_cloudflare_response_streaming()
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_completion_sagemaker():
    # litellm.set_verbose=True
--- a/litellm/tests/test_bedrock_completion.py
+++ b/litellm/tests/test_bedrock_completion.py
@ -1,257 +1,259 @@
-import sys, os
+# @pytest.mark.skip(reason="AWS Suspended Account")
-import traceback
+# import sys, os
-from dotenv import load_dotenv
+# import traceback
 # from dotenv import load_dotenv
-load_dotenv()
+# load_dotenv()
-import os, io
+# import os, io
-sys.path.insert(
+# sys.path.insert(
-    0, os.path.abspath("../..")
+#     0, os.path.abspath("../..")
-)  # Adds the parent directory to the system path
+# )  # Adds the parent directory to the system path
-import pytest
+# import pytest
-import litellm
+# import litellm
-from litellm import embedding, completion, completion_cost, Timeout
+# from litellm import embedding, completion, completion_cost, Timeout
-from litellm import RateLimitError
+# from litellm import RateLimitError
-# litellm.num_retries = 3
+# # litellm.num_retries = 3
-litellm.cache = None
+# litellm.cache = None
-litellm.success_callback = []
+# litellm.success_callback = []
-user_message = "Write a short poem about the sky"
+# user_message = "Write a short poem about the sky"
-messages = [{"content": user_message, "role": "user"}]
+# messages = [{"content": user_message, "role": "user"}]
-@pytest.fixture(autouse=True)
+# @pytest.fixture(autouse=True)
-def reset_callbacks():
+# def reset_callbacks():
-    print("\npytest fixture - resetting callbacks")
+#     print("\npytest fixture - resetting callbacks")
-    litellm.success_callback = []
+#     litellm.success_callback = []
-    litellm._async_success_callback = []
+#     litellm._async_success_callback = []
-    litellm.failure_callback = []
+#     litellm.failure_callback = []
-    litellm.callbacks = []
+#     litellm.callbacks = []
-def test_completion_bedrock_claude_completion_auth():
+# def test_completion_bedrock_claude_completion_auth():
-    print("calling bedrock claude completion params auth")
+#     print("calling bedrock claude completion params auth")
-    import os
+#     import os
-    aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
+#     aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
-    aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
+#     aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
-    aws_region_name = os.environ["AWS_REGION_NAME"]
+#     aws_region_name = os.environ["AWS_REGION_NAME"]
-    os.environ.pop("AWS_ACCESS_KEY_ID", None)
+#     os.environ.pop("AWS_ACCESS_KEY_ID", None)
-    os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
+#     os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
-    os.environ.pop("AWS_REGION_NAME", None)
+#     os.environ.pop("AWS_REGION_NAME", None)
-    try:
+#     try:
-        response = completion(
+#         response = completion(
-            model="bedrock/anthropic.claude-instant-v1",
+#             model="bedrock/anthropic.claude-instant-v1",
-            messages=messages,
+#             messages=messages,
-            max_tokens=10,
+#             max_tokens=10,
-            temperature=0.1,
+#             temperature=0.1,
-            aws_access_key_id=aws_access_key_id,
+#             aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
+#             aws_secret_access_key=aws_secret_access_key,
-            aws_region_name=aws_region_name,
+#             aws_region_name=aws_region_name,
-        )
+#         )
-        # Add any assertions here to check the response
+#         # Add any assertions here to check the response
-        print(response)
+#         print(response)
-        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
+#         os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
-        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
+#         os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
-        os.environ["AWS_REGION_NAME"] = aws_region_name
+#         os.environ["AWS_REGION_NAME"] = aws_region_name
-    except RateLimitError:
+#     except RateLimitError:
-        pass
+#         pass
-    except Exception as e:
+#     except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
+#         pytest.fail(f"Error occurred: {e}")
-# test_completion_bedrock_claude_completion_auth()
+# # test_completion_bedrock_claude_completion_auth()
-def test_completion_bedrock_claude_2_1_completion_auth():
+# def test_completion_bedrock_claude_2_1_completion_auth():
-    print("calling bedrock claude 2.1 completion params auth")
+#     print("calling bedrock claude 2.1 completion params auth")
-    import os
+#     import os
-    aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
+#     aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
-    aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
+#     aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
-    aws_region_name = os.environ["AWS_REGION_NAME"]
+#     aws_region_name = os.environ["AWS_REGION_NAME"]
-    os.environ.pop("AWS_ACCESS_KEY_ID", None)
+#     os.environ.pop("AWS_ACCESS_KEY_ID", None)
-    os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
+#     os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
-    os.environ.pop("AWS_REGION_NAME", None)
+#     os.environ.pop("AWS_REGION_NAME", None)
-    try:
+#     try:
-        response = completion(
+#         response = completion(
-            model="bedrock/anthropic.claude-v2:1",
+#             model="bedrock/anthropic.claude-v2:1",
-            messages=messages,
+#             messages=messages,
-            max_tokens=10,
+#             max_tokens=10,
-            temperature=0.1,
+#             temperature=0.1,
-            aws_access_key_id=aws_access_key_id,
+#             aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
+#             aws_secret_access_key=aws_secret_access_key,
-            aws_region_name=aws_region_name,
+#             aws_region_name=aws_region_name,
-        )
+#         )
-        # Add any assertions here to check the response
+#         # Add any assertions here to check the response
-        print(response)
+#         print(response)
-        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
+#         os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
-        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
+#         os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
-        os.environ["AWS_REGION_NAME"] = aws_region_name
+#         os.environ["AWS_REGION_NAME"] = aws_region_name
-    except RateLimitError:
+#     except RateLimitError:
-        pass
+#         pass
-    except Exception as e:
+#     except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
+#         pytest.fail(f"Error occurred: {e}")
-# test_completion_bedrock_claude_2_1_completion_auth()
+# # test_completion_bedrock_claude_2_1_completion_auth()
-def test_completion_bedrock_claude_external_client_auth():
+# def test_completion_bedrock_claude_external_client_auth():
-    print("\ncalling bedrock claude external client auth")
+#     print("\ncalling bedrock claude external client auth")
-    import os
+#     import os
-    aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
+#     aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
-    aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
+#     aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
-    aws_region_name = os.environ["AWS_REGION_NAME"]
+#     aws_region_name = os.environ["AWS_REGION_NAME"]
-    os.environ.pop("AWS_ACCESS_KEY_ID", None)
+#     os.environ.pop("AWS_ACCESS_KEY_ID", None)
-    os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
+#     os.environ.pop("AWS_SECRET_ACCESS_KEY", None)
-    os.environ.pop("AWS_REGION_NAME", None)
+#     os.environ.pop("AWS_REGION_NAME", None)
-    try:
+#     try:
-        import boto3
+#         import boto3
-        litellm.set_verbose = True
+#         litellm.set_verbose = True
-        bedrock = boto3.client(
+#         bedrock = boto3.client(
-            service_name="bedrock-runtime",
+#             service_name="bedrock-runtime",
-            region_name=aws_region_name,
+#             region_name=aws_region_name,
-            aws_access_key_id=aws_access_key_id,
+#             aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
+#             aws_secret_access_key=aws_secret_access_key,
-            endpoint_url=f"https://bedrock-runtime.{aws_region_name}.amazonaws.com",
+#             endpoint_url=f"https://bedrock-runtime.{aws_region_name}.amazonaws.com",
-        )
+#         )
-        response = completion(
+#         response = completion(
-            model="bedrock/anthropic.claude-instant-v1",
+#             model="bedrock/anthropic.claude-instant-v1",
-            messages=messages,
+#             messages=messages,
-            max_tokens=10,
+#             max_tokens=10,
-            temperature=0.1,
+#             temperature=0.1,
-            aws_bedrock_client=bedrock,
+#             aws_bedrock_client=bedrock,
-        )
+#         )
-        # Add any assertions here to check the response
+#         # Add any assertions here to check the response
-        print(response)
+#         print(response)
-        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
+#         os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
-        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
+#         os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
-        os.environ["AWS_REGION_NAME"] = aws_region_name
+#         os.environ["AWS_REGION_NAME"] = aws_region_name
-    except RateLimitError:
+#     except RateLimitError:
-        pass
+#         pass
-    except Exception as e:
+#     except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
+#         pytest.fail(f"Error occurred: {e}")
-# test_completion_bedrock_claude_external_client_auth()
+# # test_completion_bedrock_claude_external_client_auth()
-def test_completion_bedrock_claude_sts_client_auth():
+# @pytest.mark.skip(reason="Expired token, need to renew")
-    print("\ncalling bedrock claude external client auth")
+# def test_completion_bedrock_claude_sts_client_auth():
-    import os
+#     print("\ncalling bedrock claude external client auth")
 #     import os
-    aws_access_key_id = os.environ["AWS_TEMP_ACCESS_KEY_ID"]
+#     aws_access_key_id = os.environ["AWS_TEMP_ACCESS_KEY_ID"]
-    aws_secret_access_key = os.environ["AWS_TEMP_SECRET_ACCESS_KEY"]
+#     aws_secret_access_key = os.environ["AWS_TEMP_SECRET_ACCESS_KEY"]
-    aws_region_name = os.environ["AWS_REGION_NAME"]
+#     aws_region_name = os.environ["AWS_REGION_NAME"]
-    aws_role_name = os.environ["AWS_TEMP_ROLE_NAME"]
+#     aws_role_name = os.environ["AWS_TEMP_ROLE_NAME"]
-    try:
+#     try:
-        import boto3
+#         import boto3
-        litellm.set_verbose = True
+#         litellm.set_verbose = True
-        response = completion(
+#         response = completion(
-            model="bedrock/anthropic.claude-instant-v1",
+#             model="bedrock/anthropic.claude-instant-v1",
-            messages=messages,
+#             messages=messages,
-            max_tokens=10,
+#             max_tokens=10,
-            temperature=0.1,
+#             temperature=0.1,
-            aws_region_name=aws_region_name,
+#             aws_region_name=aws_region_name,
-            aws_access_key_id=aws_access_key_id,
+#             aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
+#             aws_secret_access_key=aws_secret_access_key,
-            aws_role_name=aws_role_name,
+#             aws_role_name=aws_role_name,
-            aws_session_name="my-test-session",
+#             aws_session_name="my-test-session",
-        )
+#         )
-        response = embedding(
+#         response = embedding(
-            model="cohere.embed-multilingual-v3",
+#             model="cohere.embed-multilingual-v3",
-            input=["hello world"],
+#             input=["hello world"],
-            aws_region_name="us-east-1",
+#             aws_region_name="us-east-1",
-            aws_access_key_id=aws_access_key_id,
+#             aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
+#             aws_secret_access_key=aws_secret_access_key,
-            aws_role_name=aws_role_name,
+#             aws_role_name=aws_role_name,
-            aws_session_name="my-test-session",
+#             aws_session_name="my-test-session",
-        )
+#         )
-        response = completion(
+#         response = completion(
-            model="gpt-3.5-turbo",
+#             model="gpt-3.5-turbo",
-            messages=messages,
+#             messages=messages,
-            aws_region_name="us-east-1",
+#             aws_region_name="us-east-1",
-            aws_access_key_id=aws_access_key_id,
+#             aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
+#             aws_secret_access_key=aws_secret_access_key,
-            aws_role_name=aws_role_name,
+#             aws_role_name=aws_role_name,
-            aws_session_name="my-test-session",
+#             aws_session_name="my-test-session",
-        )
+#         )
-        # Add any assertions here to check the response
+#         # Add any assertions here to check the response
-        print(response)
+#         print(response)
-    except RateLimitError:
+#     except RateLimitError:
-        pass
+#         pass
-    except Exception as e:
+#     except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
+#         pytest.fail(f"Error occurred: {e}")
-test_completion_bedrock_claude_sts_client_auth()
+# # test_completion_bedrock_claude_sts_client_auth()
-def test_provisioned_throughput():
+# def test_provisioned_throughput():
-    try:
+#     try:
-        litellm.set_verbose = True
+#         litellm.set_verbose = True
-        import botocore, json, io
+#         import botocore, json, io
-        import botocore.session
+#         import botocore.session
-        from botocore.stub import Stubber
+#         from botocore.stub import Stubber
-        bedrock_client = botocore.session.get_session().create_client(
+#         bedrock_client = botocore.session.get_session().create_client(
-            "bedrock-runtime", region_name="us-east-1"
+#             "bedrock-runtime", region_name="us-east-1"
-        )
+#         )
-        expected_params = {
+#         expected_params = {
-            "accept": "application/json",
+#             "accept": "application/json",
-            "body": '{"prompt": "\\n\\nHuman: Hello, how are you?\\n\\nAssistant: ", '
+#             "body": '{"prompt": "\\n\\nHuman: Hello, how are you?\\n\\nAssistant: ", '
-            '"max_tokens_to_sample": 256}',
+#             '"max_tokens_to_sample": 256}',
-            "contentType": "application/json",
+#             "contentType": "application/json",
-            "modelId": "provisioned-model-arn",
+#             "modelId": "provisioned-model-arn",
-        }
+#         }
-        response_from_bedrock = {
+#         response_from_bedrock = {
-            "body": io.StringIO(
+#             "body": io.StringIO(
-                json.dumps(
+#                 json.dumps(
-                    {
+#                     {
-                        "completion": " Here is a short poem about the sky:",
+#                         "completion": " Here is a short poem about the sky:",
-                        "stop_reason": "max_tokens",
+#                         "stop_reason": "max_tokens",
-                        "stop": None,
+#                         "stop": None,
-                    }
+#                     }
-                )
+#                 )
-            ),
+#             ),
-            "contentType": "contentType",
+#             "contentType": "contentType",
-            "ResponseMetadata": {"HTTPStatusCode": 200},
+#             "ResponseMetadata": {"HTTPStatusCode": 200},
-        }
+#         }
-        with Stubber(bedrock_client) as stubber:
+#         with Stubber(bedrock_client) as stubber:
-            stubber.add_response(
+#             stubber.add_response(
-                "invoke_model",
+#                 "invoke_model",
-                service_response=response_from_bedrock,
+#                 service_response=response_from_bedrock,
-                expected_params=expected_params,
+#                 expected_params=expected_params,
-            )
+#             )
-            response = litellm.completion(
+#             response = litellm.completion(
-                model="bedrock/anthropic.claude-instant-v1",
+#                 model="bedrock/anthropic.claude-instant-v1",
-                model_id="provisioned-model-arn",
+#                 model_id="provisioned-model-arn",
-                messages=[{"content": "Hello, how are you?", "role": "user"}],
+#                 messages=[{"content": "Hello, how are you?", "role": "user"}],
-                aws_bedrock_client=bedrock_client,
+#                 aws_bedrock_client=bedrock_client,
-            )
+#             )
-            print("response stubbed", response)
+#             print("response stubbed", response)
-    except Exception as e:
+#     except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
+#         pytest.fail(f"Error occurred: {e}")
-# test_provisioned_throughput()
+# # test_provisioned_throughput()
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@ -546,6 +546,7 @@ def test_redis_cache_acompletion_stream():
 # test_redis_cache_acompletion_stream()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_redis_cache_acompletion_stream_bedrock():
    import asyncio
@ -571,7 +572,7 @@ def test_redis_cache_acompletion_stream_bedrock():
        async def call1():
            nonlocal response_1_content
            response1 = await litellm.acompletion(
-                model="bedrock/anthropic.claude-v1",
+                model="bedrock/anthropic.claude-v2",
                messages=messages,
                max_tokens=40,
                temperature=1,
@ -589,7 +590,7 @@ def test_redis_cache_acompletion_stream_bedrock():
        async def call2():
            nonlocal response_2_content
            response2 = await litellm.acompletion(
-                model="bedrock/anthropic.claude-v1",
+                model="bedrock/anthropic.claude-v2",
                messages=messages,
                max_tokens=40,
                temperature=1,
@ -615,6 +616,7 @@ def test_redis_cache_acompletion_stream_bedrock():
        raise e
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_s3_cache_acompletion_stream_azure():
    import asyncio
@ -697,6 +699,7 @@ def test_s3_cache_acompletion_stream_azure():
@pytest.mark.asyncio
@pytest.mark.skip(reason="AWS Suspended Account")
 async def test_s3_cache_acompletion_azure():
    import asyncio
    import logging
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@ -1404,6 +1404,7 @@ def test_customprompt_together_ai():
 # test_customprompt_together_ai()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_sagemaker():
    try:
        litellm.set_verbose = True
@ -1429,6 +1430,7 @@ def test_completion_sagemaker():
 # test_completion_sagemaker()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_sagemaker_stream():
    try:
        litellm.set_verbose = False
@ -1459,6 +1461,7 @@ def test_completion_sagemaker_stream():
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_chat_sagemaker():
    try:
        messages = [{"role": "user", "content": "Hey, how's it going?"}]
@ -1483,6 +1486,7 @@ def test_completion_chat_sagemaker():
 # test_completion_chat_sagemaker()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_chat_sagemaker_mistral():
    try:
        messages = [{"role": "user", "content": "Hey, how's it going?"}]
@ -1501,6 +1505,7 @@ def test_completion_chat_sagemaker_mistral():
 # test_completion_chat_sagemaker_mistral()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_bedrock_titan_null_response():
    try:
        response = completion(
@ -1526,6 +1531,7 @@ def test_completion_bedrock_titan_null_response():
        pytest.fail(f"An error occurred - {str(e)}")
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_bedrock_titan():
    try:
        response = completion(
@ -1547,6 +1553,7 @@ def test_completion_bedrock_titan():
 # test_completion_bedrock_titan()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_bedrock_claude():
    print("calling claude")
    try:
@ -1568,6 +1575,7 @@ def test_completion_bedrock_claude():
 # test_completion_bedrock_claude()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_bedrock_cohere():
    print("calling bedrock cohere")
    litellm.set_verbose = True
@ -1954,11 +1962,14 @@ def test_completion_gemini():
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    try:
        response = completion(model=model_name, messages=messages)
-        # Add any assertions here to check the response
+        # Add any assertions,here to check the response
        print(response)
    except litellm.APIError as e:
        pass
    except Exception as e:
        if "InternalServerError" in str(e):
            pass
        else:
            pytest.fail(f"Error occurred: {e}")
@ -1974,7 +1985,12 @@ async def test_acompletion_gemini():
        response = await litellm.acompletion(model=model_name, messages=messages)
        # Add any assertions here to check the response
        print(f"response: {response}")
    except litellm.APIError as e:
        pass
    except Exception as e:
        if "InternalServerError" in str(e):
            pass
        else:
            pytest.fail(f"Error occurred: {e}")
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@ -171,6 +171,7 @@ def test_cost_openai_image_gen():
    assert cost == 0.019922944
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_cost_bedrock_pricing():
    """
    - get pricing specific to region for a model
@ -226,6 +227,7 @@ def test_cost_bedrock_pricing():
    assert cost == predicted_cost
@pytest.mark.skip(reason="AWS disabled our access")
 def test_cost_bedrock_pricing_actual_calls():
    litellm.set_verbose = True
    model = "anthropic.claude-instant-v1"
--- a/litellm/tests/test_configs/test_config_no_auth.yaml
+++ b/litellm/tests/test_configs/test_config_no_auth.yaml
@ -80,16 +80,6 @@ model_list:
    description: this is a test openai model
    id: 9b1ef341-322c-410a-8992-903987fef439
  model_name: test_openai_models
 - litellm_params:
    model: bedrock/amazon.titan-embed-text-v1
  model_info:
    mode: embedding
  model_name: amazon-embeddings
 - litellm_params:
    model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
  model_info:
    mode: embedding
  model_name: GPT-J 6B - Sagemaker Text Embedding (Internal)
 - litellm_params:
    model: dall-e-3
  model_info:
--- a/litellm/tests/test_custom_callback_input.py
+++ b/litellm/tests/test_custom_callback_input.py
@ -478,17 +478,18 @@ async def test_async_chat_azure_stream():
 ## Test Bedrock + sync
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_chat_bedrock_stream():
    try:
        customHandler = CompletionCustomHandler()
        litellm.callbacks = [customHandler]
        response = litellm.completion(
-            model="bedrock/anthropic.claude-v1",
+            model="bedrock/anthropic.claude-v2",
            messages=[{"role": "user", "content": "Hi 👋 - i'm sync bedrock"}],
        )
        # test streaming
        response = litellm.completion(
-            model="bedrock/anthropic.claude-v1",
+            model="bedrock/anthropic.claude-v2",
            messages=[{"role": "user", "content": "Hi 👋 - i'm sync bedrock"}],
            stream=True,
        )
@ -497,7 +498,7 @@ def test_chat_bedrock_stream():
        # test failure callback
        try:
            response = litellm.completion(
-                model="bedrock/anthropic.claude-v1",
+                model="bedrock/anthropic.claude-v2",
                messages=[{"role": "user", "content": "Hi 👋 - i'm sync bedrock"}],
                aws_region_name="my-bad-region",
                stream=True,
@ -518,18 +519,19 @@ def test_chat_bedrock_stream():
 ## Test Bedrock + Async
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_async_chat_bedrock_stream():
    try:
        customHandler = CompletionCustomHandler()
        litellm.callbacks = [customHandler]
        response = await litellm.acompletion(
-            model="bedrock/anthropic.claude-v1",
+            model="bedrock/anthropic.claude-v2",
            messages=[{"role": "user", "content": "Hi 👋 - i'm async bedrock"}],
        )
        # test streaming
        response = await litellm.acompletion(
-            model="bedrock/anthropic.claude-v1",
+            model="bedrock/anthropic.claude-v2",
            messages=[{"role": "user", "content": "Hi 👋 - i'm async bedrock"}],
            stream=True,
        )
@ -540,7 +542,7 @@ async def test_async_chat_bedrock_stream():
        ## test failure callback
        try:
            response = await litellm.acompletion(
-                model="bedrock/anthropic.claude-v1",
+                model="bedrock/anthropic.claude-v2",
                messages=[{"role": "user", "content": "Hi 👋 - i'm async bedrock"}],
                aws_region_name="my-bad-key",
                stream=True,
@ -561,6 +563,7 @@ async def test_async_chat_bedrock_stream():
 ## Test Sagemaker + Async
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_async_chat_sagemaker_stream():
    try:
@ -793,6 +796,7 @@ async def test_async_embedding_azure():
 ## Test Bedrock + Async
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_async_embedding_bedrock():
    try:
--- a/litellm/tests/test_custom_logger.py
+++ b/litellm/tests/test_custom_logger.py
@ -388,6 +388,7 @@ async def test_async_custom_handler_embedding_optional_param():
 # asyncio.run(test_async_custom_handler_embedding_optional_param())
@pytest.mark.skip(reason="AWS Account suspended. Pending their approval")
@pytest.mark.asyncio
 async def test_async_custom_handler_embedding_optional_param_bedrock():
    """
--- a/litellm/tests/test_dynamodb_logs.py
+++ b/litellm/tests/test_dynamodb_logs.py
@ -67,6 +67,7 @@ def verify_log_file(log_file_path):
    assert success_count == 3  # Expect 3 success logs from dynamoDB
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_dynamo_logging():
    # all dynamodb requests need to be in one test function
    # since we are modifying stdout, and pytests runs tests in parallel
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@ -256,6 +256,7 @@ async def test_vertexai_aembedding():
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_bedrock_embedding_titan():
    try:
        # this tests if we support str input for bedrock embedding
@ -301,6 +302,7 @@ def test_bedrock_embedding_titan():
 # test_bedrock_embedding_titan()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_bedrock_embedding_cohere():
    try:
        litellm.set_verbose = False
@ -422,6 +424,7 @@ def test_aembedding_azure():
 # test_aembedding_azure()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_sagemaker_embeddings():
    try:
        response = litellm.embedding(
@ -438,6 +441,7 @@ def test_sagemaker_embeddings():
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_sagemaker_aembeddings():
    try:
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@ -42,6 +42,7 @@ exception_models = [
 # Test 1: Context Window Errors
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.parametrize("model", exception_models)
 def test_context_window(model):
    print("Testing context window error")
@ -120,9 +121,9 @@ def invalid_auth(model):  # set the model key to an invalid key, depending on th
            os.environ["AI21_API_KEY"] = "bad-key"
        elif "togethercomputer" in model:
            temporary_key = os.environ["TOGETHERAI_API_KEY"]
-            os.environ[
+            os.environ["TOGETHERAI_API_KEY"] = (
-                "TOGETHERAI_API_KEY"
+                "84060c79880fc49df126d3e87b53f8a463ff6e1c6d27fe64207cde25cdfcd1f24a"
-            ] = "84060c79880fc49df126d3e87b53f8a463ff6e1c6d27fe64207cde25cdfcd1f24a"
+            )
        elif model in litellm.openrouter_models:
            temporary_key = os.environ["OPENROUTER_API_KEY"]
            os.environ["OPENROUTER_API_KEY"] = "bad-key"
--- a/litellm/tests/test_health_check.py
+++ b/litellm/tests/test_health_check.py
@ -87,6 +87,7 @@ async def test_azure_img_gen_health_check():
 # asyncio.run(test_azure_img_gen_health_check())
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_sagemaker_embedding_health_check():
    response = await litellm.ahealth_check(
--- a/litellm/tests/test_image_generation.py
+++ b/litellm/tests/test_image_generation.py
@ -121,6 +121,7 @@ async def test_async_image_generation_azure():
            pytest.fail(f"An exception occurred - {str(e)}")
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_image_generation_bedrock():
    try:
        litellm.set_verbose = True
@ -141,6 +142,7 @@ def test_image_generation_bedrock():
            pytest.fail(f"An exception occurred - {str(e)}")
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_aimage_generation_bedrock_with_optional_params():
    try:
--- a/litellm/tests/test_key_generate_dynamodb.py
+++ b/litellm/tests/test_key_generate_dynamodb.py
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@ -80,6 +80,14 @@ request_data = {
@pytest.fixture
 def prisma_client():
    from litellm.proxy.proxy_cli import append_query_params
    ### add connection pool + pool timeout args
    params = {"connection_limit": 100, "pool_timeout": 60}
    database_url = os.getenv("DATABASE_URL")
    modified_url = append_query_params(database_url, params)
    os.environ["DATABASE_URL"] = modified_url
    # Assuming DBClient is a class that needs to be instantiated
    prisma_client = PrismaClient(
        database_url=os.environ["DATABASE_URL"], proxy_logging_obj=proxy_logging_obj
@ -1633,3 +1641,99 @@ async def test_key_with_no_permissions(prisma_client):
    except Exception as e:
        print("Got Exception", e)
        print(e.message)
 async def track_cost_callback_helper_fn(generated_key: str, user_id: str):
    from litellm import ModelResponse, Choices, Message, Usage
    from litellm.proxy.proxy_server import (
        _PROXY_track_cost_callback as track_cost_callback,
    )
    import uuid
    request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{uuid.uuid4()}"
    resp = ModelResponse(
        id=request_id,
        choices=[
            Choices(
                finish_reason=None,
                index=0,
                message=Message(
                    content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
                    role="assistant",
                ),
            )
        ],
        model="gpt-35-turbo",  # azure always has model written like this
        usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
    )
    await track_cost_callback(
        kwargs={
            "call_type": "acompletion",
            "model": "sagemaker-chatgpt-v-2",
            "stream": True,
            "complete_streaming_response": resp,
            "litellm_params": {
                "metadata": {
                    "user_api_key": hash_token(generated_key),
                    "user_api_key_user_id": user_id,
                }
            },
            "response_cost": 0.00005,
        },
        completion_response=resp,
        start_time=datetime.now(),
        end_time=datetime.now(),
    )
@pytest.mark.skip(reason="High traffic load test for spend tracking")
@pytest.mark.asyncio
 async def test_proxy_load_test_db(prisma_client):
    """
    Run 1500 req./s against track_cost_callback function
    """
    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
    from litellm._logging import verbose_proxy_logger
    import logging, time
    litellm.set_verbose = True
    verbose_proxy_logger.setLevel(logging.DEBUG)
    try:
        start_time = time.time()
        await litellm.proxy.proxy_server.prisma_client.connect()
        request = GenerateKeyRequest(max_budget=0.00001)
        key = await generate_key_fn(request)
        print(key)
        generated_key = key.key
        user_id = key.user_id
        bearer_token = "Bearer " + generated_key
        request = Request(scope={"type": "http"})
        request._url = URL(url="/chat/completions")
        # use generated key to auth in
        result = await user_api_key_auth(request=request, api_key=bearer_token)
        print("result from user auth with new key", result)
        # update spend using track_cost callback, make 2nd request, it should fail
        n = 5000
        tasks = [
            track_cost_callback_helper_fn(generated_key=generated_key, user_id=user_id)
            for _ in range(n)
        ]
        completions = await asyncio.gather(*tasks)
        await asyncio.sleep(120)
        try:
            # call spend logs
            spend_logs = await view_spend_logs(api_key=generated_key)
            print(f"len responses: {len(spend_logs)}")
            assert len(spend_logs) == n
            print(n, time.time() - start_time, len(spend_logs))
        except:
            print(n, time.time() - start_time, 0)
        raise Exception(f"it worked! key={key.key}")
    except Exception as e:
        pytest.fail(f"An exception occurred - {str(e)}")
--- a/litellm/tests/test_model_max_token_adjust.py
+++ b/litellm/tests/test_model_max_token_adjust.py
@ -12,6 +12,7 @@ import litellm
 from litellm import completion
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_sagemaker():
    litellm.set_verbose = True
    litellm.drop_params = True
--- a/litellm/tests/test_provider_specific_config.py
+++ b/litellm/tests/test_provider_specific_config.py
@ -473,6 +473,7 @@ def aleph_alpha_test_completion():
 #  Sagemaker
@pytest.mark.skip(reason="AWS Suspended Account")
 def sagemaker_test_completion():
    litellm.SagemakerConfig(max_new_tokens=10)
    # litellm.set_verbose=True
@ -514,6 +515,7 @@ def sagemaker_test_completion():
 #  Bedrock
@pytest.mark.skip(reason="AWS Suspended Account")
 def bedrock_test_completion():
    litellm.AmazonCohereConfig(max_tokens=10)
    # litellm.set_verbose=True
--- a/litellm/tests/test_proxy_server.py
+++ b/litellm/tests/test_proxy_server.py
@ -125,6 +125,7 @@ def test_embedding(client_no_auth):
        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_bedrock_embedding(client_no_auth):
    global headers
    from litellm.proxy.proxy_server import user_custom_auth
@ -145,6 +146,7 @@ def test_bedrock_embedding(client_no_auth):
        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_sagemaker_embedding(client_no_auth):
    global headers
    from litellm.proxy.proxy_server import user_custom_auth
--- a/litellm/tests/test_proxy_server_caching.py
+++ b/litellm/tests/test_proxy_server_caching.py
@ -61,6 +61,7 @@ def generate_random_word(length=4):
    return "".join(random.choice(letters) for _ in range(length))
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_chat_completion(client_no_auth):
    global headers
    try:
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@ -166,14 +166,6 @@ def test_call_one_endpoint():
                "tpm": 240000,
                "rpm": 1800,
            },
            {
                "model_name": "claude-v1",
                "litellm_params": {
                    "model": "bedrock/anthropic.claude-instant-v1",
                },
                "tpm": 100000,
                "rpm": 10000,
            },
            {
                "model_name": "text-embedding-ada-002",
                "litellm_params": {
@ -202,15 +194,6 @@ def test_call_one_endpoint():
            )
            print("\n response", response)
        async def call_bedrock_claude():
            response = await router.acompletion(
                model="bedrock/anthropic.claude-instant-v1",
                messages=[{"role": "user", "content": "hello this request will pass"}],
                specific_deployment=True,
            )
            print("\n response", response)
        async def call_azure_embedding():
            response = await router.aembedding(
                model="azure/azure-embedding-model",
@ -221,7 +204,6 @@ def test_call_one_endpoint():
            print("\n response", response)
        asyncio.run(call_azure_completion())
        asyncio.run(call_bedrock_claude())
        asyncio.run(call_azure_embedding())
        os.environ["AZURE_API_BASE"] = old_api_base
@ -593,6 +575,7 @@ def test_azure_embedding_on_router():
 # test_azure_embedding_on_router()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_bedrock_on_router():
    litellm.set_verbose = True
    print("\n Testing bedrock on router\n")
--- a/litellm/tests/test_router_timeout.py
+++ b/litellm/tests/test_router_timeout.py
@ -87,6 +87,7 @@ def test_router_timeouts():
        print("********** TOKENS USED SO FAR = ", total_tokens_used)
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_router_timeouts_bedrock():
    import openai
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@ -764,6 +764,7 @@ def test_completion_replicate_stream_bad_key():
 # test_completion_replicate_stream_bad_key()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_bedrock_claude_stream():
    try:
        litellm.set_verbose = False
@ -810,6 +811,7 @@ def test_completion_bedrock_claude_stream():
 # test_completion_bedrock_claude_stream()
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_bedrock_ai21_stream():
    try:
        litellm.set_verbose = False
@ -911,6 +913,7 @@ def test_sagemaker_weird_response():
 # test_sagemaker_weird_response()
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_sagemaker_streaming_async():
    try:
@ -949,6 +952,7 @@ async def test_sagemaker_streaming_async():
 # asyncio.run(test_sagemaker_streaming_async())
@pytest.mark.skip(reason="AWS Suspended Account")
 def test_completion_sagemaker_stream():
    try:
        response = completion(
@ -1075,8 +1079,6 @@ async def test_hf_completion_tgi_stream():
            if finished:
                break
            idx += 1
        if complete_response.strip() == "":
            raise Exception("Empty response received")
        print(f"completion_response: {complete_response}")
    except litellm.ServiceUnavailableError as e:
        pass
--- a/litellm/tests/test_utils.py
+++ b/litellm/tests/test_utils.py
@ -317,3 +317,24 @@ def test_token_counter():
 # test_token_counter()
 def test_supports_function_calling():
    try:
        assert litellm.supports_function_calling(model="gpt-3.5-turbo") == True
        assert (
            litellm.supports_function_calling(model="azure/gpt-4-1106-preview") == True
        )
        assert (
            litellm.supports_function_calling(model="anthropic.claude-instant-v1")
            == False
        )
        assert litellm.supports_function_calling(model="palm/chat-bison") == False
        assert litellm.supports_function_calling(model="ollama/llama2") == False
        assert (
            litellm.supports_function_calling(model="anthropic.claude-instant-v1")
            == False
        )
        assert litellm.supports_function_calling(model="claude-2") == False
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -205,18 +205,18 @@ def map_finish_reason(
 class FunctionCall(OpenAIObject):
    arguments: str
-    name: str
+    name: Optional[str] = None
 class Function(OpenAIObject):
    arguments: str
-    name: str
+    name: Optional[str] = None
 class ChatCompletionDeltaToolCall(OpenAIObject):
-    id: str
+    id: Optional[str] = None
    function: Function
-    type: str
+    type: Optional[str] = None
    index: int
@ -275,13 +275,19 @@ class Delta(OpenAIObject):
        super(Delta, self).__init__(**params)
        self.content = content
        self.role = role
        if function_call is not None and isinstance(function_call, dict):
            self.function_call = FunctionCall(**function_call)
        else:
            self.function_call = function_call
-        if tool_calls is not None and isinstance(tool_calls, dict):
+        if tool_calls is not None and isinstance(tool_calls, list):
            self.tool_calls = []
            for tool_call in tool_calls:
                if isinstance(tool_call, dict):
                    if tool_call.get("index", None) is None:
                        tool_call["index"] = 0
                    self.tool_calls.append(ChatCompletionDeltaToolCall(**tool_call))
                elif isinstance(tool_call, ChatCompletionDeltaToolCall):
                    self.tool_calls.append(tool_call)
        else:
            self.tool_calls = tool_calls
@ -1634,7 +1640,7 @@ class Logging:
            verbose_logger.debug(
                "Async success callbacks: Got a complete streaming response"
            )
-            self.model_call_details["complete_streaming_response"] = (
+            self.model_call_details["async_complete_streaming_response"] = (
                complete_streaming_response
            )
            try:
@ -1682,28 +1688,31 @@ class Logging:
                    print_verbose("async success_callback: reaches cache for logging!")
                    kwargs = self.model_call_details
                    if self.stream:
-                        if "complete_streaming_response" not in kwargs:
+                        if "async_complete_streaming_response" not in kwargs:
                            print_verbose(
-                                f"async success_callback: reaches cache for logging, there is no complete_streaming_response. Kwargs={kwargs}\n\n"
+                                f"async success_callback: reaches cache for logging, there is no async_complete_streaming_response. Kwargs={kwargs}\n\n"
                            )
                            pass
                        else:
                            print_verbose(
-                                "async success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
+                                "async success_callback: reaches cache for logging, there is a async_complete_streaming_response. Adding to cache"
                            )
-                            result = kwargs["complete_streaming_response"]
+                            result = kwargs["async_complete_streaming_response"]
                            # only add to cache once we have a complete streaming response
                            litellm.cache.add_cache(result, **kwargs)
                if isinstance(callback, CustomLogger):  # custom logger class
                    print_verbose(
-                        f"Async success callbacks: {callback}; self.stream: {self.stream}; complete_streaming_response: {self.model_call_details.get('complete_streaming_response', None)}"
+                        f"Running Async success callback: {callback}; self.stream: {self.stream}; async_complete_streaming_response: {self.model_call_details.get('async_complete_streaming_response', None)} result={result}"
                    )
                    if self.stream == True:
-                        if "complete_streaming_response" in self.model_call_details:
+                        if (
                            "async_complete_streaming_response"
                            in self.model_call_details
                        ):
                            await callback.async_log_success_event(
                                kwargs=self.model_call_details,
                                response_obj=self.model_call_details[
-                                    "complete_streaming_response"
+                                    "async_complete_streaming_response"
                                ],
                                start_time=start_time,
                                end_time=end_time,
@ -1724,14 +1733,18 @@ class Logging:
                        )
                if callable(callback):  # custom logger functions
                    print_verbose(
-                        f"Making async function logging call - {self.model_call_details}"
+                        f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
                    )
                    if self.stream:
-                        if "complete_streaming_response" in self.model_call_details:
+                        if (
                            "async_complete_streaming_response"
                            in self.model_call_details
                        ):
                            await customLogger.async_log_event(
                                kwargs=self.model_call_details,
                                response_obj=self.model_call_details[
-                                    "complete_streaming_response"
+                                    "async_complete_streaming_response"
                                ],
                                start_time=start_time,
                                end_time=end_time,
@ -1752,14 +1765,17 @@ class Logging:
                    if dynamoLogger is None:
                        dynamoLogger = DyanmoDBLogger()
                    if self.stream:
-                        if "complete_streaming_response" in self.model_call_details:
+                        if (
                            "async_complete_streaming_response"
                            in self.model_call_details
                        ):
                            print_verbose(
                                "DynamoDB Logger: Got Stream Event - Completed Stream Response"
                            )
                            await dynamoLogger._async_log_event(
                                kwargs=self.model_call_details,
                                response_obj=self.model_call_details[
-                                    "complete_streaming_response"
+                                    "async_complete_streaming_response"
                                ],
                                start_time=start_time,
                                end_time=end_time,
@ -3713,6 +3729,54 @@ def completion_cost(
        raise e
 def supports_function_calling(model: str):
    """
    Check if the given model supports function calling and return a boolean value.
    Parameters:
    model (str): The model name to be checked.
    Returns:
    bool: True if the model supports function calling, False otherwise.
    Raises:
    Exception: If the given model is not found in model_prices_and_context_window.json.
    """
    if model in litellm.model_cost:
        model_info = litellm.model_cost[model]
        if model_info.get("supports_function_calling", False):
            return True
        return False
    else:
        raise Exception(
            f"Model not in model_prices_and_context_window.json. You passed model={model}."
        )
 def supports_parallel_function_calling(model: str):
    """
    Check if the given model supports parallel function calling and return True if it does, False otherwise.
    Parameters:
        model (str): The model to check for support of parallel function calling.
    Returns:
        bool: True if the model supports parallel function calling, False otherwise.
    Raises:
        Exception: If the model is not found in the model_cost dictionary.
    """
    if model in litellm.model_cost:
        model_info = litellm.model_cost[model]
        if model_info.get("supports_parallel_function_calling", False):
            return True
        return False
    else:
        raise Exception(
            f"Model not in model_prices_and_context_window.json. You passed model={model}."
        )
 ####### HELPER FUNCTIONS ################
 def register_model(model_cost: Union[str, dict]):
    """
@ -4041,6 +4105,7 @@ def get_optional_params(
            and custom_llm_provider != "vertex_ai"
            and custom_llm_provider != "anyscale"
            and custom_llm_provider != "together_ai"
            and custom_llm_provider != "mistral"
        ):
            if custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat":
                # ollama actually supports json output
@ -4711,7 +4776,14 @@ def get_optional_params(
        if max_tokens:
            optional_params["max_tokens"] = max_tokens
    elif custom_llm_provider == "mistral":
-        supported_params = ["temperature", "top_p", "stream", "max_tokens"]
+        supported_params = [
            "temperature",
            "top_p",
            "stream",
            "max_tokens",
            "tools",
            "tool_choice",
        ]
        _check_valid_arg(supported_params=supported_params)
        if temperature is not None:
            optional_params["temperature"] = temperature
@ -4721,6 +4793,10 @@ def get_optional_params(
            optional_params["stream"] = stream
        if max_tokens is not None:
            optional_params["max_tokens"] = max_tokens
        if tools is not None:
            optional_params["tools"] = tools
        if tool_choice is not None:
            optional_params["tool_choice"] = tool_choice
        # check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion
        safe_mode = passed_params.pop("safe_mode", None)
@ -6945,7 +7021,7 @@ def exception_type(
                if "500 An internal error has occurred." in error_str:
                    exception_mapping_worked = True
                    raise APIError(
-                        status_code=original_exception.status_code,
+                        status_code=getattr(original_exception, "status_code", 500),
                        message=f"PalmException - {original_exception.message}",
                        llm_provider="palm",
                        model=model,
@ -8728,7 +8804,7 @@ class CustomStreamWrapper:
                        or original_chunk.choices[0].delta.tool_calls is not None
                    ):
                        try:
-                            delta = dict(original_chunk.choices[0].delta)
+                            delta = original_chunk.choices[0].delta
                            model_response.system_fingerprint = (
                                original_chunk.system_fingerprint
                            )
@ -8763,7 +8839,9 @@ class CustomStreamWrapper:
                                                is None
                                            ):
                                                t.function.arguments = ""
-                            model_response.choices[0].delta = Delta(**delta)
+                            _json_delta = delta.model_dump()
                            print_verbose(f"_json_delta: {_json_delta}")
                            model_response.choices[0].delta = Delta(**_json_delta)
                        except Exception as e:
                            traceback.print_exc()
                            model_response.choices[0].delta = Delta()
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@ -6,7 +6,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-4-turbo-preview": {
        "max_tokens": 8192, 
@ -15,7 +16,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-0314": {
        "max_tokens": 8192,
@ -33,7 +36,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-4-32k": {
        "max_tokens": 32768,
@ -69,7 +73,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-0125-preview": {
        "max_tokens": 128000,
@ -78,7 +84,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-vision-preview": {
        "max_tokens": 128000,
@ -105,7 +113,8 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4097,
@ -123,7 +132,8 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gpt-3.5-turbo-1106": {
        "max_tokens": 16385,
@ -132,7 +142,9 @@
        "input_cost_per_token": 0.0000010,
        "output_cost_per_token": 0.0000020,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-3.5-turbo-0125": {
        "max_tokens": 16385,
@ -141,7 +153,9 @@
        "input_cost_per_token": 0.0000005,
        "output_cost_per_token": 0.0000015,
        "litellm_provider": "openai",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16385,
@ -286,7 +300,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-4-1106-preview": {
        "max_tokens": 128000,
@ -295,7 +311,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-4-0613": {
        "max_tokens": 8192,
@ -304,7 +322,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/gpt-4-32k-0613": {
        "max_tokens": 32768,
@ -331,7 +350,8 @@
        "input_cost_per_token": 0.00003,
        "output_cost_per_token": 0.00006,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/gpt-4-turbo": {
        "max_tokens": 128000,
@ -340,7 +360,9 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure", 
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-4-turbo-vision-preview": {
        "max_tokens": 128000,
@ -358,7 +380,8 @@
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/gpt-35-turbo-1106": {
        "max_tokens": 16384,
@ -367,7 +390,20 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-35-turbo-0125": {
        "max_tokens": 16384,
        "max_input_tokens": 16384,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.0000005,
        "output_cost_per_token": 0.0000015,
        "litellm_provider": "azure",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "azure/gpt-35-turbo-16k": {
        "max_tokens": 16385,
@ -385,7 +421,8 @@
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "azure",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/ada": {
        "max_tokens": 8191,
@ -514,11 +551,12 @@
        "mode": "chat"
    },
    "mistral/mistral-large-latest": {
-        "max_tokens": 8192,
+        "max_tokens": 32000,
        "input_cost_per_token": 0.000008,
        "output_cost_per_token": 0.000024,
        "litellm_provider": "mistral",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "mistral/mistral-embed": {
        "max_tokens": 8192,
@ -676,7 +714,8 @@
        "input_cost_per_token": 0.00000025, 
        "output_cost_per_token": 0.0000005,
        "litellm_provider": "vertex_ai-language-models",
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
    },
    "gemini-1.5-pro": { 
        "max_tokens": 8192,
@ -687,6 +726,15 @@
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat"
    },
    "gemini-1.5-pro-preview-0215": { 
        "max_tokens": 8192,
        "max_input_tokens": 1000000,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0, 
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat"
    },
    "gemini-pro-vision": {
        "max_tokens": 16384,
        "max_output_tokens": 2048,
@ -1729,6 +1777,23 @@
        "output_cost_per_token": 0.0000009,
        "litellm_provider": "together_ai"
    },
    "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
        "input_cost_per_token": 0.0000006,
        "output_cost_per_token": 0.0000006,
        "litellm_provider": "together_ai",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
        "litellm_provider": "together_ai",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "together_ai/togethercomputer/CodeLlama-34b-Instruct": {
        "litellm_provider": "together_ai",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "ollama/llama2": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.0,
@ -1981,7 +2046,16 @@
        "input_cost_per_token": 0.00000015, 
        "output_cost_per_token": 0.00000015,
        "litellm_provider": "anyscale", 
-        "mode": "chat"
+        "mode": "chat",
        "supports_function_calling": true
      },
      "anyscale/Mixtral-8x7B-Instruct-v0.1": {
        "max_tokens": 16384, 
        "input_cost_per_token": 0.00000015, 
        "output_cost_per_token": 0.00000015,
        "litellm_provider": "anyscale", 
        "mode": "chat",
        "supports_function_calling": true
      },
      "anyscale/HuggingFaceH4/zephyr-7b-beta": {
        "max_tokens": 16384, 
--- a/proxy_server_config.yaml
+++ b/proxy_server_config.yaml
@ -40,6 +40,8 @@ litellm_settings:
  budget_duration: 30d
 general_settings: 
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
  proxy_budget_rescheduler_min_time: 30
  proxy_budget_rescheduler_max_time: 60
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
 environment_variables:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.27.12"
+version = "1.27.15"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 [tool.commitizen]
-version = "1.27.12"
+version = "1.27.15"
 version_files = [
    "pyproject.toml:^version"
 ]
--- a/requirements.txt
+++ b/requirements.txt
@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep
 boto3==1.34.34 # aws bedrock/sagemaker calls
 redis==5.0.0 # caching
 numpy==1.24.3 # semantic caching
 pandas==2.1.1 # for viewing clickhouse spend analytics
 prisma==0.11.0 # for db
 mangum==0.17.0 # for aws lambda functions
 google-generativeai==0.3.2 # for vertex ai calls
--- a/tests/test_keys.py
+++ b/tests/test_keys.py
@ -449,7 +449,7 @@ async def test_key_with_budgets():
        reset_at_init_value = key_info["info"]["budget_reset_at"]
        reset_at_new_value = None
        i = 0
-        await asyncio.sleep(610)
+        await asyncio.sleep(120)
        while i < 3:
            key_info = await get_key_info(session=session, get_key=key, call_key=key)
            reset_at_new_value = key_info["info"]["budget_reset_at"]
@ -490,6 +490,7 @@ async def test_key_crossing_budget():
            assert "ExceededTokenBudget: Current spend for token:" in str(e)
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.asyncio
 async def test_key_info_spend_values_sagemaker():
    """
--- a/ui/litellm-dashboard/src/components/networking.tsx
+++ b/ui/litellm-dashboard/src/components/networking.tsx
@ -313,6 +313,7 @@ export const userSpendLogsCall = async (
  endTime: String
 ) => {
  try {
    console.log(`user role in spend logs call: ${userRole}`);
    let url = proxyBaseUrl ? `${proxyBaseUrl}/spend/logs` : `/spend/logs`;
    if (userRole == "App Owner") {
      url = `${url}/?user_id=${userID}&start_date=${startTime}&end_date=${endTime}`;
@ -343,6 +344,96 @@ export const userSpendLogsCall = async (
  }
 };
 export const adminSpendLogsCall = async (accessToken: String) => {
  try {
    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/global/spend/logs`
      : `/global/spend/logs`;
    message.info("Making spend logs request");
    const response = await fetch(url, {
      method: "GET",
      headers: {
        Authorization: `Bearer ${accessToken}`,
        "Content-Type": "application/json",
      },
    });
    if (!response.ok) {
      const errorData = await response.text();
      message.error(errorData);
      throw new Error("Network response was not ok");
    }
    const data = await response.json();
    console.log(data);
    message.success("Spend Logs received");
    return data;
  } catch (error) {
    console.error("Failed to create key:", error);
    throw error;
  }
 };
 export const adminTopKeysCall = async (accessToken: String) => {
  try {
    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/global/spend/keys?limit=5`
      : `/global/spend/keys?limit=5`;
    message.info("Making spend keys request");
    const response = await fetch(url, {
      method: "GET",
      headers: {
        Authorization: `Bearer ${accessToken}`,
        "Content-Type": "application/json",
      },
    });
    if (!response.ok) {
      const errorData = await response.text();
      message.error(errorData);
      throw new Error("Network response was not ok");
    }
    const data = await response.json();
    console.log(data);
    message.success("Spend Logs received");
    return data;
  } catch (error) {
    console.error("Failed to create key:", error);
    throw error;
  }
 };
 export const adminTopModelsCall = async (accessToken: String) => {
  try {
    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/global/spend/models?limit=5`
      : `/global/spend/models?limit=5`;
    message.info("Making spend models request");
    const response = await fetch(url, {
      method: "GET",
      headers: {
        Authorization: `Bearer ${accessToken}`,
        "Content-Type": "application/json",
      },
    });
    if (!response.ok) {
      const errorData = await response.text();
      message.error(errorData);
      throw new Error("Network response was not ok");
    }
    const data = await response.json();
    console.log(data);
    message.success("Spend Logs received");
    return data;
  } catch (error) {
    console.error("Failed to create key:", error);
    throw error;
  }
 };
 export const keyInfoCall = async (accessToken: String, keys: String[]) => {
  try {
    let url = proxyBaseUrl ? `${proxyBaseUrl}/v2/key/info` : `/v2/key/info`;
--- a/ui/litellm-dashboard/src/components/usage.tsx
+++ b/ui/litellm-dashboard/src/components/usage.tsx
@ -2,7 +2,13 @@ import { BarChart, Card, Title } from "@tremor/react";
 import React, { useState, useEffect } from "react";
 import { Grid, Col, Text, LineChart } from "@tremor/react";
-import { userSpendLogsCall, keyInfoCall } from "./networking";
+import {
  userSpendLogsCall,
  keyInfoCall,
  adminSpendLogsCall,
  adminTopKeysCall,
  adminTopModelsCall,
 } from "./networking";
 import { start } from "repl";
 interface UsagePageProps {
@ -164,6 +170,25 @@ const UsagePage: React.FC<UsagePageProps> = ({
    if (accessToken && token && userRole && userID) {
      const fetchData = async () => {
        try {
          /**
           * If user is Admin - query the global views endpoints
           * If user is App Owner - use the normal spend logs call
           */
          console.log(`user role: ${userRole}`);
          if (userRole == "Admin") {
            const overall_spend = await adminSpendLogsCall(accessToken);
            setKeySpendData(overall_spend);
            const top_keys = await adminTopKeysCall(accessToken);
            const filtered_keys = top_keys.map((k: any) => ({
              key: (k["key_name"] || k["key_alias"] || k["api_key"]).substring(
                0,
                7
              ),
              spend: k["total_spend"],
            }));
            setTopKeys(filtered_keys);
            const top_models = await adminTopModelsCall(accessToken);
          } else if (userRole == "App Owner") {
            await userSpendLogsCall(
              accessToken,
              token,
@ -172,21 +197,34 @@ const UsagePage: React.FC<UsagePageProps> = ({
              startTime,
              endTime
            ).then(async (response) => {
              console.log("result from spend logs call", response);
              if ("daily_spend" in response) {
                // this is from clickhouse analytics
                //
                let daily_spend = response["daily_spend"];
                console.log("daily spend", daily_spend);
                setKeySpendData(daily_spend);
                let topApiKeys = response.top_api_keys;
                setTopKeys(topApiKeys);
              } else {
                const topKeysResponse = await keyInfoCall(
                  accessToken,
                  getTopKeys(response)
                );
                const filtered_keys = topKeysResponse["info"].map((k: any) => ({
-              key: (k["key_name"] || k["key_alias"] || k["token"]).substring(
+                  key: (
-                0,
+                    k["key_name"] ||
-                7
+                    k["key_alias"] ||
-              ),
+                    k["token"]
                  ).substring(0, 7),
                  spend: k["spend"],
                }));
                setTopKeys(filtered_keys);
                setTopUsers(getTopUsers(response));
                setKeySpendData(response);
              }
            });
          }
        } catch (error) {
          console.error("There was an error fetching the data", error);
          // Optionally, update your UI to reflect the error state here as well
@ -210,7 +248,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
              valueFormatter={valueFormatter}
              yAxisWidth={100}
              tickGap={5}
-              customTooltip={customTooltip}
+              // customTooltip={customTooltip}
            />
          </Card>
        </Col>
		`@ -0,0 +1 @@`
							`self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-d6107f1aac0c574c.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();`
		`@ -0,0 +1 @@`
							`self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()`