added changes from upstream

Merge branch 'main' into fix/error-on-get-user-role
Nick Wong 2024-05-09 16:14:14 -07:00
commit d3a228d03b
No known key found for this signature in database
GPG key ID: F97B88DE019A52E9
142 changed files with 4439 additions and 801 deletions

View file

@@ -188,7 +188,7 @@ jobs:
          command: |
            docker run -d \
            -p 4000:4000 \
-           -e DATABASE_URL=$PROXY_DOCKER_DB_URL \
+           -e DATABASE_URL=$PROXY_DATABASE_URL \
            -e AZURE_API_KEY=$AZURE_API_KEY \
            -e REDIS_HOST=$REDIS_HOST \
            -e REDIS_PASSWORD=$REDIS_PASSWORD \
@@ -223,7 +223,7 @@ jobs:
          background: true
      - run:
          name: Wait for app to be ready
-         command: dockerize -wait http://localhost:4000 -timeout 1m
+         command: dockerize -wait http://localhost:4000 -timeout 5m
      - run:
          name: Run tests
          command: |

View file

@@ -0,0 +1,51 @@
{
"name": "Python 3.11",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
// https://github.com/devcontainers/images/tree/main/src/python
// https://mcr.microsoft.com/en-us/product/devcontainers/python/tags
// "build": {
// "dockerfile": "Dockerfile",
// "context": ".."
// },
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Configure tool-specific properties.
"customizations": {
// Configure properties specific to VS Code.
"vscode": {
"settings": {},
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance",
"GitHub.copilot",
"GitHub.copilot-chat"
]
}
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
"forwardPorts": [4000],
"containerEnv": {
"LITELLM_LOG": "DEBUG"
},
// Use 'portsAttributes' to set default properties for specific forwarded ports.
// More info: https://containers.dev/implementors/json_reference/#port-attributes
"portsAttributes": {
"4000": {
"label": "LiteLLM Server",
"onAutoForward": "notify"
}
},
// More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "litellm",
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "pipx install poetry && poetry install -E extra_proxy -E proxy"
}

View file

@@ -64,6 +64,11 @@ if __name__ == "__main__":
    )  # Replace with your repository's username and name
    latest_release = repo.get_latest_release()
    print("got latest release: ", latest_release)
+   print(latest_release.title)
+   print(latest_release.tag_name)
+   release_version = latest_release.title
    print("latest release body: ", latest_release.body)
    print("markdown table: ", markdown_table)
@@ -74,8 +79,22 @@ if __name__ == "__main__":
    start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
    existing_release_body = latest_release.body[:start_index]
+   docker_run_command = f"""
+\n\n
+## Docker Run LiteLLM Proxy
+```
+docker run \\
+-e STORE_MODEL_IN_DB=True \\
+-p 4000:4000 \\
+ghcr.io/berriai/litellm:main-{release_version}
+```
+"""
+   print("docker run command: ", docker_run_command)
    new_release_body = (
        existing_release_body
+       + docker_run_command
        + "\n\n"
        + "### Don't want to maintain your internal proxy? get in touch 🎉"
        + "\nHosted Proxy Alpha: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"

.gitignore vendored (4 changes)
View file

@@ -1,5 +1,6 @@
.venv
.env
+litellm/proxy/myenv/*
litellm_uuid.txt
__pycache__/
*.pyc
@@ -52,3 +53,6 @@ litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
+litellm/proxy/myenv/bin/activate
+litellm/proxy/myenv/bin/Activate.ps1
+myenv/*

View file

@@ -16,11 +16,11 @@ repos:
        name: Check if files match
        entry: python3 ci_cd/check_files_match.py
        language: system
-  - repo: local
-    hooks:
-      - id: mypy
-        name: mypy
-        entry: python3 -m mypy --ignore-missing-imports
-        language: system
-        types: [python]
-        files: ^litellm/
+  # - repo: local
+  #   hooks:
+  #     - id: mypy
+  #       name: mypy
+  #       entry: python3 -m mypy --ignore-missing-imports
+  #       language: system
+  #       types: [python]
+  #       files: ^litellm/

View file

@@ -226,6 +226,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
+| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |

Binary file not shown.

View file

@@ -0,0 +1,15 @@
{
"$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#",
"handler": "Microsoft.Azure.CreateUIDef",
"version": "0.1.2-preview",
"parameters": {
"config": {
"isWizard": false,
"basics": { }
},
"basics": [ ],
"steps": [ ],
"outputs": { },
"resourceTypes": [ ]
}
}

View file

@@ -0,0 +1,63 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"imageName": {
"type": "string",
"defaultValue": "ghcr.io/berriai/litellm:main-latest"
},
"containerName": {
"type": "string",
"defaultValue": "litellm-container"
},
"dnsLabelName": {
"type": "string",
"defaultValue": "litellm"
},
"portNumber": {
"type": "int",
"defaultValue": 4000
}
},
"resources": [
{
"type": "Microsoft.ContainerInstance/containerGroups",
"apiVersion": "2021-03-01",
"name": "[parameters('containerName')]",
"location": "[resourceGroup().location]",
"properties": {
"containers": [
{
"name": "[parameters('containerName')]",
"properties": {
"image": "[parameters('imageName')]",
"resources": {
"requests": {
"cpu": 1,
"memoryInGB": 2
}
},
"ports": [
{
"port": "[parameters('portNumber')]"
}
]
}
}
],
"osType": "Linux",
"restartPolicy": "Always",
"ipAddress": {
"type": "Public",
"ports": [
{
"protocol": "tcp",
"port": "[parameters('portNumber')]"
}
],
"dnsNameLabel": "[parameters('dnsLabelName')]"
}
}
}
]
}

View file

@@ -0,0 +1,42 @@
param imageName string = 'ghcr.io/berriai/litellm:main-latest'
param containerName string = 'litellm-container'
param dnsLabelName string = 'litellm'
param portNumber int = 4000
resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = {
name: containerName
location: resourceGroup().location
properties: {
containers: [
{
name: containerName
properties: {
image: imageName
resources: {
requests: {
cpu: 1
memoryInGB: 2
}
}
ports: [
{
port: portNumber
}
]
}
}
]
osType: 'Linux'
restartPolicy: 'Always'
ipAddress: {
type: 'Public'
ports: [
{
protocol: 'tcp'
port: portNumber
}
]
dnsNameLabel: dnsLabelName
}
}
}
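For context, templates like the ARM JSON above and this Bicep equivalent are typically deployed with the Azure CLI. The sketch below is illustrative and not part of the commit; the resource-group name, region, and file names are placeholders.

```bash
# Hypothetical deployment of the container-group template with the Azure CLI.
az group create --name litellm-rg --location eastus

# Deploy the Bicep file (the ARM JSON variant works the same way via --template-file <file>.json).
az deployment group create \
  --resource-group litellm-rg \
  --template-file containerapp.bicep \
  --parameters dnsLabelName=my-litellm portNumber=4000
```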

View file

@@ -83,6 +83,7 @@ def completion(
    top_p: Optional[float] = None,
    n: Optional[int] = None,
    stream: Optional[bool] = None,
+   stream_options: Optional[dict] = None,
    stop=None,
    max_tokens: Optional[int] = None,
    presence_penalty: Optional[float] = None,
@@ -139,6 +140,10 @@ def completion(
- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
+- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true`
+  - `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.
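As a companion to the documented `stream_options` parameter, here is a minimal sketch of how the `include_usage` option might be exercised with `litellm.completion`. It is not taken from this commit; the model name and API key are placeholders.

```python
import os
import litellm

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key

# Stream a completion and request a final usage chunk, per the docs above.
# With include_usage=True the last chunk before [DONE] carries token usage
# for the entire request; earlier chunks carry message deltas.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello from litellm"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in response:
    print(chunk)
```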

View file

@@ -47,3 +47,12 @@ Pricing is based on usage. We can figure out a price that works for your team, o
<Image img={require('../img/litellm_hosted_ui_router.png')} />

#### [**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
## Feature List
- Easy way to add/remove models
- 100% uptime even when models are added/removed
- custom callback webhooks
- your domain name with HTTPS
- Ability to create/delete User API keys
- Reasonable set monthly cost

View file

@@ -14,14 +14,14 @@ import TabItem from '@theme/TabItem';
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['OPENAI_API_KEY'] = ""
chat = ChatLiteLLM(model="gpt-3.5-turbo")
@@ -30,7 +30,7 @@ messages = [
        content="what model are you"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
@@ -39,14 +39,14 @@ chat(messages)
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['ANTHROPIC_API_KEY'] = ""
chat = ChatLiteLLM(model="claude-2", temperature=0.3)
@@ -55,7 +55,7 @@ messages = [
        content="what model are you"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
@@ -64,14 +64,14 @@ chat(messages)
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['REPLICATE_API_TOKEN'] = ""
chat = ChatLiteLLM(model="replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1")
@@ -80,7 +80,7 @@ messages = [
        content="what model are you?"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
@@ -89,14 +89,14 @@ chat(messages)
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['COHERE_API_KEY'] = ""
chat = ChatLiteLLM(model="command-nightly")
@@ -105,32 +105,9 @@ messages = [
        content="what model are you?"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
<TabItem value="palm" label="PaLM - Google">
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
os.environ['PALM_API_KEY'] = ""
chat = ChatLiteLLM(model="palm/chat-bison")
messages = [
HumanMessage(
content="what model are you?"
)
]
chat(messages)
```
</TabItem>
</Tabs>

View file

@@ -94,9 +94,10 @@ print(response)
```
-### Set Custom Trace ID, Trace User ID and Tags
-Pass `trace_id`, `trace_user_id` in `metadata`
+### Set Custom Trace ID, Trace User ID, Trace Metadata, Trace Version, Trace Release and Tags
+Pass `trace_id`, `trace_user_id`, `trace_metadata`, `trace_version`, `trace_release`, `tags` in `metadata`

```python
import litellm
@@ -121,12 +122,20 @@ response = completion(
    metadata={
        "generation_name": "ishaan-test-generation", # set langfuse Generation Name
        "generation_id": "gen-id22", # set langfuse Generation ID
+       "version": "test-generation-version" # set langfuse Generation Version
        "trace_user_id": "user-id2", # set langfuse Trace User ID
        "session_id": "session-1", # set langfuse Session ID
-       "tags": ["tag1", "tag2"] # set langfuse Tags
+       "tags": ["tag1", "tag2"], # set langfuse Tags
        "trace_id": "trace-id22", # set langfuse Trace ID
+       "trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
+       "trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
+       "trace_release": "test-trace-release", # set langfuse Trace Release
        ### OR ###
        "existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
+       ### OR enforce that certain fields are trace overwritten in the trace during the continuation ###
+       "existing_trace_id": "trace-id22",
+       "trace_metadata": {"key": "updated_trace_value"}, # The new value to use for the langfuse Trace Metadata
+       "update_trace_keys": ["input", "output", "trace_metadata"], # Updates the trace input & output to be this generations input & output also updates the Trace Metadata to match the passed in value
    },
)
@@ -134,6 +143,38 @@ print(response)
```
### Trace & Generation Parameters
#### Trace Specific Parameters
* `trace_id` - Identifier for the trace, must use `existing_trace_id` instead or in conjunction with `trace_id` if this is an existing trace, auto-generated by default
* `trace_name` - Name of the trace, auto-generated by default
* `session_id` - Session identifier for the trace, defaults to `None`
* `trace_version` - Version for the trace, defaults to value for `version`
* `trace_release` - Release for the trace, defaults to `None`
* `trace_metadata` - Metadata for the trace, defaults to `None`
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
* `tags` - Tags for the trace, defeaults to `None`
##### Updatable Parameters on Continuation
The following parameters can be updated on a continuation of a trace by passing in the following values into the `update_trace_keys` in the metadata of the completion.
* `input` - Will set the traces input to be the input of this latest generation
* `output` - Will set the traces output to be the output of this generation
* `trace_version` - Will set the trace version to be the provided value (To use the latest generations version instead, use `version`)
* `trace_release` - Will set the trace release to be the provided value
* `trace_metadata` - Will set the trace metadata to the provided value
* `trace_user_id` - Will set the trace user id to the provided value
#### Generation Specific Parameters
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `prompt` - Langfuse prompt object used for the generation, defaults to None
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
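To make the continuation flow described above concrete, here is a small sketch (not part of the diff; the trace id and metadata values are illustrative) of logging a follow-up generation onto an existing trace while letting `update_trace_keys` overwrite selected trace fields:

```python
import litellm
from litellm import completion

litellm.success_callback = ["langfuse"]

# Second call in an already-created trace: reuse the trace id and name the
# trace fields that this generation is allowed to overwrite.
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "follow-up question"}],
    metadata={
        "existing_trace_id": "trace-id22",           # continue this trace
        "trace_metadata": {"key": "updated_value"},  # new metadata for the trace
        "update_trace_keys": ["input", "output", "trace_metadata"],
    },
)
```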
### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id`, `session_id` in model_kwargs
```python

View file

@@ -0,0 +1,54 @@
# Deepseek
https://deepseek.com/
**We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['DEEPSEEK_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['DEEPSEEK_API_KEY'] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['DEEPSEEK_API_KEY'] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models - ALL Deepseek Models Supported!
We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
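If the new provider is used through the LiteLLM proxy rather than the SDK, a config entry along these lines would be expected. This is a sketch based on the proxy's usual `model_list` format, not content taken from this commit:

```yaml
model_list:
  - model_name: deepseek-chat
    litellm_params:
      model: deepseek/deepseek-chat
      api_key: os.environ/DEEPSEEK_API_KEY
```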

View file

@@ -45,13 +45,13 @@ for chunk in response:
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/c1b25538277206b9f00de5254d80d6a83bb19a29/model_prices_and_context_window.json).

| Model Name | Function Call |
-|--------------------------|--------------------------------------------------------------|
-| mistral-tiny | `completion(model="mistral/mistral-tiny", messages)` |
-| mistral-small | `completion(model="mistral/mistral-small", messages)` |
-| mistral-medium | `completion(model="mistral/mistral-medium", messages)` |
-| mistral-large-latest | `completion(model="mistral/mistral-large-latest", messages)` |
-| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
+|----------------|--------------------------------------------------------------|
+| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
+| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
+| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` |
+| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
+| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
+| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |

## Function Calling
@@ -116,6 +116,6 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported
| Model Name | Function Call |
|--------------------------|--------------------------------------------------------------|
-| mistral-embed | `embedding(model="mistral/mistral-embed", input)` |
+| Mistral Embeddings | `embedding(model="mistral/mistral-embed", input)` |

View file

@@ -17,6 +17,7 @@ This is a new feature, and subject to changes based on feedback.
### Step 1. Setup Proxy
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
+- `JWT_AUDIENCE`: This is the audience used for decoding the JWT. If not set, the decode step will not verify the audience.

```bash
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"

View file

@@ -12,8 +12,8 @@ Requirements:
You can set budgets at 3 levels:
- For the proxy
-- For a user
+- For an internal user
-- For a 'user' passed to `/chat/completions`, `/embeddings` etc
+- For an end-user
- For a key
- For a key (model specific budgets)
@@ -58,7 +58,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
-<TabItem value="per-user" label="For User">
+<TabItem value="per-user" label="For Internal User">

Apply a budget across multiple keys.
@@ -165,12 +165,12 @@ curl --location 'http://localhost:4000/team/new' \
}
```
</TabItem>
-<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
+<TabItem value="per-user-chat" label="For End User">

Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**

**Step 1. Modify config.yaml**
-Define `litellm.max_user_budget`
+Define `litellm.max_end_user_budget`
```yaml
general_settings:
  master_key: sk-1234
@@ -328,7 +328,7 @@ You can set:
- max parallel requests

<Tabs>
-<TabItem value="per-user" label="Per User">
+<TabItem value="per-user" label="Per Internal User">

Use `/user/new`, to persist rate limits across multiple keys.
@@ -408,7 +408,7 @@ curl --location 'http://localhost:4000/user/new' \
```
-## Create new keys for existing user
+## Create new keys for existing internal user

Just include user_id in the `/key/generate` request.
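The config hunk above is cut off in this view; for orientation, a sketch of how the renamed setting might appear in `config.yaml`. The section placement and the example value are assumptions on the editor's part, not content from this commit:

```yaml
general_settings:
  master_key: sk-1234

litellm_settings:
  max_end_user_budget: 0.0001  # assumed: USD budget applied per end-user passed via `user`
```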

View file

@@ -96,7 +96,7 @@ print(response)
- `router.aimage_generation()` - async image generation calls

## Advanced - Routing Strategies
-#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based

Router provides 4 strategies for routing your calls across multiple deployments:
@@ -467,6 +467,101 @@ async def router_acompletion():
asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
Picks a deployment based on the lowest cost
How this works:
- Get all healthy deployments
- Select all deployments that are under their provided `rpm/tpm` limits
- For each deployment check if `litellm_param["model"]` exists in [`litellm_model_cost_map`](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
- if deployment does not exist in `litellm_model_cost_map` -> use deployment_cost= `$1`
- Select deployment with lowest cost
```python
from litellm import Router
import asyncio
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-4"},
"model_info": {"id": "openai-gpt-4"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "groq/llama3-8b-8192"},
"model_info": {"id": "groq-llama"},
},
]
# init router
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost
return response
asyncio.run(router_acompletion())
```
#### Using Custom Input/Output pricing
Set `litellm_params["input_cost_per_token"]` and `litellm_params["output_cost_per_token"]` for using custom pricing when routing
```python
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00003,
},
"model_info": {"id": "chatgpt-v-experimental"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-1",
"input_cost_per_token": 0.000000001,
"output_cost_per_token": 0.00000001,
},
"model_info": {"id": "chatgpt-v-1"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-5",
"input_cost_per_token": 10,
"output_cost_per_token": 12,
},
"model_info": {"id": "chatgpt-v-5"},
},
]
# init router
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost
return response
asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
@@ -991,6 +1086,46 @@ async def test_acompletion_caching_on_router_caching_groups():
asyncio.run(test_acompletion_caching_on_router_caching_groups())
```
## Alerting 🚨
Send alerts to slack / your webhook url for the following events
- LLM API Exceptions
- Slow LLM Responses
Get a slack webhook url from https://api.slack.com/messaging/webhooks
#### Usage
Initialize an `AlertingConfig` and pass it to `litellm.Router`. The following code will trigger an alert because `api_key=bad-key` which is invalid
```python
from litellm.router import AlertingConfig
import litellm
import os
router = litellm.Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "bad_key",
},
}
],
alerting_config= AlertingConfig(
alerting_threshold=10, # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
webhook_url= os.getenv("SLACK_WEBHOOK_URL") # webhook you want to send alerts to
),
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except:
pass
```
## Track cost for Azure Deployments
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking

@@ -1159,6 +1294,7 @@ def __init__(
        "least-busy",
        "usage-based-routing",
        "latency-based-routing",
+       "cost-based-routing",
    ] = "simple-shuffle",
    ## DEBUGGING ##

View file

@@ -134,6 +134,7 @@ const sidebars = {
        "providers/ollama",
        "providers/perplexity",
        "providers/groq",
+       "providers/deepseek",
        "providers/fireworks_ai",
        "providers/vllm",
        "providers/xinference",

View file

@@ -291,7 +291,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
def _forecast_daily_cost(data: list):
-    import requests
+    import requests  # type: ignore
    from datetime import datetime, timedelta

    if len(data) == 0:

View file

@@ -361,6 +361,7 @@ openai_compatible_endpoints: List = [
    "api.deepinfra.com/v1/openai",
    "api.mistral.ai/v1",
    "api.groq.com/openai/v1",
+   "api.deepseek.com/v1",
    "api.together.xyz/v1",
]
@@ -369,6 +370,7 @@ openai_compatible_providers: List = [
    "anyscale",
    "mistral",
    "groq",
+   "deepseek",
    "deepinfra",
    "perplexity",
    "xinference",
@@ -523,6 +525,7 @@ provider_list: List = [
    "anyscale",
    "mistral",
    "groq",
+   "deepseek",
    "maritalk",
    "voyage",
    "cloudflare",

View file

@@ -10,8 +10,8 @@
# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
import os
import inspect
-import redis, litellm
+import redis, litellm  # type: ignore
-import redis.asyncio as async_redis
+import redis.asyncio as async_redis  # type: ignore
from typing import List, Optional

View file

@@ -10,7 +10,7 @@
import os, json, time
import litellm
from litellm.utils import ModelResponse
-import requests, threading
+import requests, threading  # type: ignore
from typing import Optional, Union, Literal

View file

@@ -106,7 +106,7 @@ class InMemoryCache(BaseCache):
                return_val.append(val)
        return return_val

-    async def async_increment(self, key, value: int, **kwargs) -> int:
+    async def async_increment(self, key, value: float, **kwargs) -> float:
        # get the value
        init_value = await self.async_get_cache(key=key) or 0
        value = init_value + value
@@ -423,12 +423,12 @@ class RedisCache(BaseCache):
            if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
                await self.flush_cache_buffer()  # logging done in here

-    async def async_increment(self, key, value: int, **kwargs) -> int:
+    async def async_increment(self, key, value: float, **kwargs) -> float:
        _redis_client = self.init_async_client()
        start_time = time.time()
        try:
            async with _redis_client as redis_client:
-                result = await redis_client.incr(name=key, amount=value)
+                result = await redis_client.incrbyfloat(name=key, amount=value)
                ## LOGGING ##
                end_time = time.time()
                _duration = end_time - start_time
@@ -1382,18 +1382,41 @@ class DualCache(BaseCache):
            print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
            traceback.print_exc()
async def async_batch_set_cache(
self, cache_list: list, local_only: bool = False, **kwargs
):
"""
Batch write values to the cache
"""
print_verbose(
f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
)
try:
if self.in_memory_cache is not None:
await self.in_memory_cache.async_set_cache_pipeline(
cache_list=cache_list, **kwargs
)
if self.redis_cache is not None and local_only == False:
await self.redis_cache.async_set_cache_pipeline(
cache_list=cache_list, ttl=kwargs.get("ttl", None)
)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
    async def async_increment_cache(
-        self, key, value: int, local_only: bool = False, **kwargs
-    ) -> int:
+        self, key, value: float, local_only: bool = False, **kwargs
+    ) -> float:
        """
        Key - the key in cache
-        Value - int - the value you want to increment by
-        Returns - int - the incremented value
+        Value - float - the value you want to increment by
+        Returns - float - the incremented value
        """
        try:
-            result: int = value
+            result: float = value
            if self.in_memory_cache is not None:
                result = await self.in_memory_cache.async_increment(
                    key, value, **kwargs
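For orientation, a sketch of how the widened `async_increment_cache` signature (int to float) and the new `async_batch_set_cache` might be called. The `(key, value)` tuple format for `cache_list` mirrors the existing `async_set_cache_pipeline` helpers and is an assumption, not something shown in this hunk:

```python
import asyncio
from litellm.caching import DualCache

async def demo():
    cache = DualCache()  # in-memory only unless a RedisCache is also passed in

    # Fractional spend can now be accumulated, since values are floats
    # and the Redis path uses INCRBYFLOAT instead of INCR.
    total = await cache.async_increment_cache(key="user:123:spend", value=0.25)
    print(total)

    # Batch-write several key/value pairs in one call (tuple format assumed).
    await cache.async_batch_set_cache(
        cache_list=[("model:gpt-4:tpm", 1000), ("model:gpt-4:rpm", 10)],
        ttl=60,
    )

asyncio.run(demo())
```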

View file

@@ -1,7 +1,6 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
-import requests

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -4,18 +4,30 @@ import datetime

class AthinaLogger:
    def __init__(self):
        import os

        self.athina_api_key = os.getenv("ATHINA_API_KEY")
        self.headers = {
            "athina-api-key": self.athina_api_key,
-            "Content-Type": "application/json"
+            "Content-Type": "application/json",
        }
        self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
-        self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]
+        self.additional_keys = [
+            "environment",
+            "prompt_slug",
+            "customer_id",
+            "customer_user_id",
+            "session_id",
+            "external_reference_id",
+            "context",
+            "expected_response",
+            "user_query",
+        ]

    def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
-        import requests
+        import requests  # type: ignore
        import json
        import traceback

        try:
            response_json = response_obj.model_dump() if response_obj else {}
            data = {
@@ -23,19 +35,30 @@ class AthinaLogger:
                "request": kwargs,
                "response": response_json,
                "prompt_tokens": response_json.get("usage", {}).get("prompt_tokens"),
-                "completion_tokens": response_json.get("usage", {}).get("completion_tokens"),
+                "completion_tokens": response_json.get("usage", {}).get(
+                    "completion_tokens"
+                ),
                "total_tokens": response_json.get("usage", {}).get("total_tokens"),
            }

-            if type(end_time) == datetime.datetime and type(start_time) == datetime.datetime:
-                data["response_time"] = int((end_time - start_time).total_seconds() * 1000)
+            if (
+                type(end_time) == datetime.datetime
+                and type(start_time) == datetime.datetime
+            ):
+                data["response_time"] = int(
+                    (end_time - start_time).total_seconds() * 1000
+                )

            if "messages" in kwargs:
                data["prompt"] = kwargs.get("messages", None)

            # Directly add tools or functions if present
            optional_params = kwargs.get("optional_params", {})
-            data.update((k, v) for k, v in optional_params.items() if k in ["tools", "functions"])
+            data.update(
+                (k, v)
+                for k, v in optional_params.items()
+                if k in ["tools", "functions"]
+            )

            # Add additional metadata keys
            metadata = kwargs.get("litellm_params", {}).get("metadata", {})
@@ -44,11 +67,19 @@ class AthinaLogger:
                if key in metadata:
                    data[key] = metadata[key]

-            response = requests.post(self.athina_logging_url, headers=self.headers, data=json.dumps(data, default=str))
+            response = requests.post(
+                self.athina_logging_url,
+                headers=self.headers,
+                data=json.dumps(data, default=str),
+            )
            if response.status_code != 200:
-                print_verbose(f"Athina Logger Error - {response.text}, {response.status_code}")
+                print_verbose(
+                    f"Athina Logger Error - {response.text}, {response.status_code}"
+                )
            else:
                print_verbose(f"Athina Logger Succeeded - {response.text}")
        except Exception as e:
-            print_verbose(f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}")
+            print_verbose(
+                f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}"
+            )
            pass

View file

@@ -1,7 +1,7 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -3,7 +3,6 @@
#### What this does ####
# On success, logs events to Promptlayer
import dotenv, os
-import requests

from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache

View file

@@ -1,7 +1,6 @@
#### What this does ####
# On success, logs events to Promptlayer
import dotenv, os
-import requests

from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache

View file

@@ -2,7 +2,7 @@
# On success + failure, log events to Supabase
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -2,7 +2,7 @@
# On success + failure, log events to Supabase
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -1,15 +1,17 @@
-import requests
+import requests  # type: ignore
import json
import traceback
from datetime import datetime, timezone


class GreenscaleLogger:
    def __init__(self):
        import os

        self.greenscale_api_key = os.getenv("GREENSCALE_API_KEY")
        self.headers = {
            "api-key": self.greenscale_api_key,
-            "Content-Type": "application/json"
+            "Content-Type": "application/json",
        }
        self.greenscale_logging_url = os.getenv("GREENSCALE_ENDPOINT")
@@ -19,13 +21,18 @@ class GreenscaleLogger:
        data = {
            "modelId": kwargs.get("model"),
            "inputTokenCount": response_json.get("usage", {}).get("prompt_tokens"),
-            "outputTokenCount": response_json.get("usage", {}).get("completion_tokens"),
+            "outputTokenCount": response_json.get("usage", {}).get(
+                "completion_tokens"
+            ),
        }
-        data["timestamp"] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
+        data["timestamp"] = datetime.now(timezone.utc).strftime(
+            "%Y-%m-%dT%H:%M:%SZ"
+        )

        if type(end_time) == datetime and type(start_time) == datetime:
-            data["invocationLatency"] = int((end_time - start_time).total_seconds() * 1000)
+            data["invocationLatency"] = int(
+                (end_time - start_time).total_seconds() * 1000
+            )

        # Add additional metadata keys to tags
        tags = []
@@ -37,15 +44,25 @@ class GreenscaleLogger:
            elif key == "greenscale_application":
                data["application"] = value
            else:
-                tags.append({"key": key.replace("greenscale_", ""), "value": str(value)})
+                tags.append(
+                    {"key": key.replace("greenscale_", ""), "value": str(value)}
+                )

        data["tags"] = tags

-        response = requests.post(self.greenscale_logging_url, headers=self.headers, data=json.dumps(data, default=str))
+        response = requests.post(
+            self.greenscale_logging_url,
+            headers=self.headers,
+            data=json.dumps(data, default=str),
+        )
        if response.status_code != 200:
-            print_verbose(f"Greenscale Logger Error - {response.text}, {response.status_code}")
+            print_verbose(
+                f"Greenscale Logger Error - {response.text}, {response.status_code}"
+            )
        else:
            print_verbose(f"Greenscale Logger Succeeded - {response.text}")
    except Exception as e:
-        print_verbose(f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}")
+        print_verbose(
+            f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}"
+        )
        pass

View file

@@ -1,7 +1,7 @@
#### What this does ####
# On success, logs events to Helicone
import dotenv, os
-import requests
+import requests  # type: ignore
import litellm

dotenv.load_dotenv()  # Loading env variables using dotenv

View file

@@ -262,6 +262,7 @@ class LangFuseLogger:
        try:
            tags = []
+            metadata = copy.deepcopy(metadata)  # Avoid modifying the original metadata
            supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
            supports_prompt = Version(langfuse.version.__version__) >= Version("2.7.3")
            supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
@@ -272,36 +273,9 @@ class LangFuseLogger:
            print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")

            if supports_tags:
-                metadata_tags = metadata.get("tags", [])
+                metadata_tags = metadata.pop("tags", [])
                tags = metadata_tags
trace_name = metadata.get("trace_name", None)
trace_id = metadata.get("trace_id", None)
existing_trace_id = metadata.get("existing_trace_id", None)
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
if existing_trace_id is not None:
trace_params = {"id": existing_trace_id}
else: # don't overwrite an existing trace
trace_params = {
"name": trace_name,
"input": input,
"user_id": metadata.get("trace_user_id", user_id),
"id": trace_id,
"session_id": metadata.get("session_id", None),
}
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output
cost = kwargs.get("response_cost", None)
print_verbose(f"trace: {cost}")
            # Clean Metadata before logging - never log raw metadata
            # the raw metadata can contain circular references which leads to infinite recursion
            # we clean out all extra litellm metadata params before logging
@@ -328,6 +302,66 @@ class LangFuseLogger:
                else:
                    clean_metadata[key] = value
session_id = clean_metadata.pop("session_id", None)
trace_name = clean_metadata.pop("trace_name", None)
trace_id = clean_metadata.pop("trace_id", None)
existing_trace_id = clean_metadata.pop("existing_trace_id", None)
update_trace_keys = clean_metadata.pop("update_trace_keys", [])
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
if existing_trace_id is not None:
trace_params = {"id": existing_trace_id}
# Update the following keys for this trace
for metadata_param_key in update_trace_keys:
trace_param_key = metadata_param_key.replace("trace_", "")
if trace_param_key not in trace_params:
updated_trace_value = clean_metadata.pop(
metadata_param_key, None
)
if updated_trace_value is not None:
trace_params[trace_param_key] = updated_trace_value
# Pop the trace specific keys that would have been popped if there were a new trace
for key in list(
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
):
clean_metadata.pop(key, None)
# Special keys that are found in the function arguments and not the metadata
if "input" in update_trace_keys:
trace_params["input"] = input
if "output" in update_trace_keys:
trace_params["output"] = output
else: # don't overwrite an existing trace
trace_params = {
"id": trace_id,
"name": trace_name,
"session_id": session_id,
"input": input,
"version": clean_metadata.pop(
"trace_version", clean_metadata.get("version", None)
), # If provided just version, it will applied to the trace as well, if applied a trace version it will take precedence
}
for key in list(
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
):
trace_params[key.replace("trace_", "")] = clean_metadata.pop(
key, None
)
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output
cost = kwargs.get("response_cost", None)
print_verbose(f"trace: {cost}")
            if (
                litellm._langfuse_default_tags is not None
                and isinstance(litellm._langfuse_default_tags, list)
@@ -387,7 +421,7 @@ class LangFuseLogger:
                "completion_tokens": response_obj["usage"]["completion_tokens"],
                "total_cost": cost if supports_costs else None,
            }
-            generation_name = metadata.get("generation_name", None)
+            generation_name = clean_metadata.pop("generation_name", None)
            if generation_name is None:
                # just log `litellm-{call_type}` as the generation name
                generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
@@ -402,7 +436,7 @@ class LangFuseLogger:
            generation_params = {
                "name": generation_name,
-                "id": metadata.get("generation_id", generation_id),
+                "id": clean_metadata.pop("generation_id", generation_id),
                "start_time": start_time,
                "end_time": end_time,
                "model": kwargs["model"],
@@ -412,10 +446,11 @@ class LangFuseLogger:
                "usage": usage,
                "metadata": clean_metadata,
                "level": level,
+                "version": clean_metadata.pop("version", None),
            }

            if supports_prompt:
-                generation_params["prompt"] = metadata.get("prompt", None)
+                generation_params["prompt"] = clean_metadata.pop("prompt", None)

            if output is not None and isinstance(output, str) and level == "ERROR":
                generation_params["status_message"] = output

View file

@@ -1,15 +1,14 @@
#### What this does ####
# On success, logs events to Langsmith
-import dotenv, os
+import dotenv, os  # type: ignore
-import requests
+import requests  # type: ignore
-import requests
from datetime import datetime

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import asyncio
import types
-from pydantic import BaseModel
+from pydantic import BaseModel  # type: ignore

def is_serializable(value):
@@ -79,8 +78,6 @@ class LangsmithLogger:
            except:
                response_obj = response_obj.dict()  # type: ignore

-            print(f"response_obj: {response_obj}")
            data = {
                "name": run_name,
                "run_type": "llm",  # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
@@ -90,7 +87,6 @@ class LangsmithLogger:
                "start_time": start_time,
                "end_time": end_time,
            }
-            print(f"data: {data}")

            response = requests.post(
                "https://api.smith.langchain.com/runs",

View file

@@ -2,7 +2,6 @@
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
import dotenv, os, json
-import requests
import litellm

dotenv.load_dotenv()  # Loading env variables using dotenv
@@ -60,7 +59,7 @@ class OpenMeterLogger(CustomLogger):
            "total_tokens": response_obj["usage"].get("total_tokens"),
        }

-        subject = kwargs.get("user", None),  # end-user passed in via 'user' param
+        subject = (kwargs.get("user", None),)  # end-user passed in via 'user' param
        if not subject:
            raise Exception("OpenMeter: user is required")

View file

@@ -3,7 +3,7 @@
# On success, log events to Prometheus
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
@@ -19,7 +19,6 @@ class PrometheusLogger:
        **kwargs,
    ):
        try:
-            print(f"in init prometheus metrics")
            from prometheus_client import Counter

            self.litellm_llm_api_failed_requests_metric = Counter(

View file

@@ -4,7 +4,7 @@
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
@@ -183,7 +183,6 @@ class PrometheusServicesLogger:
        )

    async def async_service_failure_hook(self, payload: ServiceLoggerPayload):
-        print(f"received error payload: {payload.error}")
        if self.mock_testing:
            self.mock_testing_failure_calls += 1

View file

@@ -1,12 +1,13 @@
#### What this does ####
# On success, logs events to Promptlayer
import dotenv, os
-import requests
+import requests  # type: ignore
from pydantic import BaseModel

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

class PromptLayerLogger:
    # Class variables or attributes
    def __init__(self):
@@ -32,7 +33,11 @@ class PromptLayerLogger:
            tags = kwargs["litellm_params"]["metadata"]["pl_tags"]

            # Remove "pl_tags" from metadata
-            metadata = {k:v for k, v in kwargs["litellm_params"]["metadata"].items() if k != "pl_tags"}
+            metadata = {
+                k: v
+                for k, v in kwargs["litellm_params"]["metadata"].items()
+                if k != "pl_tags"
+            }

        print_verbose(
            f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}"

View file

@ -2,7 +2,6 @@
# On success + failure, log events to Supabase # On success + failure, log events to Supabase
import dotenv, os import dotenv, os
import requests
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback

View file

@ -1,25 +1,82 @@
#### What this does #### #### What this does ####
# Class for sending Slack Alerts # # Class for sending Slack Alerts #
import dotenv, os import dotenv, os
from litellm.proxy._types import UserAPIKeyAuth
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import copy
import traceback
from litellm._logging import verbose_logger, verbose_proxy_logger from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm import litellm, threading
from typing import List, Literal, Any, Union, Optional, Dict from typing import List, Literal, Any, Union, Optional, Dict
from litellm.caching import DualCache from litellm.caching import DualCache
import asyncio import asyncio
import aiohttp import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime import datetime
from pydantic import BaseModel
from enum import Enum
from datetime import datetime as dt, timedelta
from litellm.integrations.custom_logger import CustomLogger
import random
class SlackAlerting: class LiteLLMBase(BaseModel):
"""
Implements default functions, all pydantic objects should have.
"""
def json(self, **kwargs):
try:
return self.model_dump() # noqa
except:
# if using pydantic v1
return self.dict()
class SlackAlertingArgs(LiteLLMBase):
daily_report_frequency: int = 12 * 60 * 60 # 12 hours
report_check_interval: int = 5 * 60 # 5 minutes
class DeploymentMetrics(LiteLLMBase):
"""
Metrics per deployment, stored in cache
Used for daily reporting
"""
id: str
"""id of deployment in router model list"""
failed_request: bool
"""did it fail the request?"""
latency_per_output_token: Optional[float]
"""latency/output token of deployment"""
updated_at: dt
"""Current time of deployment being updated"""
class SlackAlertingCacheKeys(Enum):
"""
Enum for deployment daily metrics keys - {deployment_id}:{enum}
"""
failed_requests_key = "failed_requests_daily_metrics"
latency_key = "latency_daily_metrics"
report_sent_key = "daily_metrics_report_sent"
class SlackAlerting(CustomLogger):
"""
Class for sending Slack Alerts
"""
# Class variables or attributes # Class variables or attributes
def __init__( def __init__(
self, self,
alerting_threshold: float = 300, internal_usage_cache: Optional[DualCache] = None,
alerting_threshold: float = 300, # threshold for slow / hanging llm responses (in seconds)
alerting: Optional[List] = [], alerting: Optional[List] = [],
alert_types: Optional[ alert_types: Optional[
List[ List[
@ -29,6 +86,7 @@ class SlackAlerting:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
] ]
] = [ ] = [
@ -37,18 +95,23 @@ class SlackAlerting:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
], ],
alert_to_webhook_url: Optional[ alert_to_webhook_url: Optional[
Dict Dict
] = None, # if user wants to separate alerts to diff channels ] = None, # if user wants to separate alerts to diff channels
alerting_args={},
default_webhook_url: Optional[str] = None,
): ):
self.alerting_threshold = alerting_threshold self.alerting_threshold = alerting_threshold
self.alerting = alerting self.alerting = alerting
self.alert_types = alert_types self.alert_types = alert_types
self.internal_usage_cache = DualCache() self.internal_usage_cache = internal_usage_cache or DualCache()
self.async_http_handler = AsyncHTTPHandler() self.async_http_handler = AsyncHTTPHandler()
self.alert_to_webhook_url = alert_to_webhook_url self.alert_to_webhook_url = alert_to_webhook_url
pass self.is_running = False
self.alerting_args = SlackAlertingArgs(**alerting_args)
self.default_webhook_url = default_webhook_url
def update_values( def update_values(
self, self,
@ -56,6 +119,7 @@ class SlackAlerting:
alerting_threshold: Optional[float] = None, alerting_threshold: Optional[float] = None,
alert_types: Optional[List] = None, alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None, alert_to_webhook_url: Optional[Dict] = None,
alerting_args: Optional[Dict] = None,
): ):
if alerting is not None: if alerting is not None:
self.alerting = alerting self.alerting = alerting
@ -63,7 +127,8 @@ class SlackAlerting:
self.alerting_threshold = alerting_threshold self.alerting_threshold = alerting_threshold
if alert_types is not None: if alert_types is not None:
self.alert_types = alert_types self.alert_types = alert_types
if alerting_args is not None:
self.alerting_args = SlackAlertingArgs(**alerting_args)
if alert_to_webhook_url is not None: if alert_to_webhook_url is not None:
# update the dict # update the dict
if self.alert_to_webhook_url is None: if self.alert_to_webhook_url is None:
@ -90,18 +155,23 @@ class SlackAlerting:
def _add_langfuse_trace_id_to_alert( def _add_langfuse_trace_id_to_alert(
self, self,
request_info: str,
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
kwargs: Optional[dict] = None, ) -> Optional[str]:
type: Literal["hanging_request", "slow_response"] = "hanging_request", """
start_time: Optional[datetime.datetime] = None, Returns langfuse trace url
end_time: Optional[datetime.datetime] = None, """
):
# do nothing for now # do nothing for now
pass if (
return request_info request_data is not None
and request_data.get("metadata", {}).get("trace_id", None) is not None
):
trace_id = request_data["metadata"]["trace_id"]
if litellm.utils.langFuseLogger is not None:
base_url = litellm.utils.langFuseLogger.Langfuse.base_url
return f"{base_url}/trace/{trace_id}"
return None
def _response_taking_too_long_callback( def _response_taking_too_long_callback_helper(
self, self,
kwargs, # kwargs to completion kwargs, # kwargs to completion
start_time, start_time,
@ -166,7 +236,7 @@ class SlackAlerting:
return return
time_difference_float, model, api_base, messages = ( time_difference_float, model, api_base, messages = (
self._response_taking_too_long_callback( self._response_taking_too_long_callback_helper(
kwargs=kwargs, kwargs=kwargs,
start_time=start_time, start_time=start_time,
end_time=end_time, end_time=end_time,
@ -182,6 +252,9 @@ class SlackAlerting:
and "metadata" in kwargs["litellm_params"] and "metadata" in kwargs["litellm_params"]
): ):
_metadata = kwargs["litellm_params"]["metadata"] _metadata = kwargs["litellm_params"]["metadata"]
request_info = litellm.utils._add_key_name_and_team_to_alert(
request_info=request_info, metadata=_metadata
)
_deployment_latency_map = self._get_deployment_latencies_to_alert( _deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=_metadata metadata=_metadata
@ -196,8 +269,178 @@ class SlackAlerting:
alert_type="llm_too_slow", alert_type="llm_too_slow",
) )
async def log_failure_event(self, original_exception: Exception): async def async_update_daily_reports(
pass self, deployment_metrics: DeploymentMetrics
) -> int:
"""
Store the perf by deployment in cache
- Number of failed requests per deployment
- Latency / output tokens per deployment
'deployment_id:daily_metrics:failed_requests'
'deployment_id:daily_metrics:latency_per_output_token'
Returns
int - count of metrics set (1 - if just latency, 2 - if failed + latency)
"""
return_val = 0
try:
## FAILED REQUESTS ##
if deployment_metrics.failed_request:
await self.internal_usage_cache.async_increment_cache(
key="{}:{}".format(
deployment_metrics.id,
SlackAlertingCacheKeys.failed_requests_key.value,
),
value=1,
)
return_val += 1
## LATENCY ##
if deployment_metrics.latency_per_output_token is not None:
await self.internal_usage_cache.async_increment_cache(
key="{}:{}".format(
deployment_metrics.id, SlackAlertingCacheKeys.latency_key.value
),
value=deployment_metrics.latency_per_output_token,
)
return_val += 1
return return_val
except Exception as e:
return 0
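The accounting above relies on the `{deployment_id}:{metric}` key scheme from `SlackAlertingCacheKeys`. A rough sketch of the same bookkeeping against a plain dict (a stand-in for `DualCache`, not the real cache API), just to make the increment semantics concrete:

```
# Dict-backed stand-in for the async cache; illustrates the key scheme only.
from typing import Dict, Optional

FAILED_KEY = "failed_requests_daily_metrics"
LATENCY_KEY = "latency_daily_metrics"

def update_daily_metrics(
    cache: Dict[str, float],
    deployment_id: str,
    failed_request: bool,
    latency_per_output_token: Optional[float],
) -> int:
    """Returns how many metrics were touched (0, 1, or 2)."""
    updated = 0
    if failed_request:
        key = f"{deployment_id}:{FAILED_KEY}"
        cache[key] = cache.get(key, 0) + 1  # failed-request counter
        updated += 1
    if latency_per_output_token is not None:
        key = f"{deployment_id}:{LATENCY_KEY}"
        cache[key] = cache.get(key, 0.0) + latency_per_output_token  # summed latency/token
        updated += 1
    return updated

cache: Dict[str, float] = {}
update_daily_metrics(cache, "model-123", failed_request=True, latency_per_output_token=0.004)
print(cache)  # {'model-123:failed_requests_daily_metrics': 1, 'model-123:latency_daily_metrics': 0.004}
```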
async def send_daily_reports(self, router) -> bool:
"""
Send a daily report on:
- Top 5 deployments with most failed requests
- Top 5 slowest deployments (normalized by latency/output tokens)
Get the value from redis cache (if available) or in-memory and send it
Cleanup:
- reset values in cache -> prevent memory leak
Returns:
True -> if successfully sent
False -> if not sent
"""
ids = router.get_model_ids()
# get keys
failed_request_keys = [
"{}:{}".format(id, SlackAlertingCacheKeys.failed_requests_key.value)
for id in ids
]
latency_keys = [
"{}:{}".format(id, SlackAlertingCacheKeys.latency_key.value) for id in ids
]
combined_metrics_keys = failed_request_keys + latency_keys # reduce cache calls
combined_metrics_values = await self.internal_usage_cache.async_batch_get_cache(
keys=combined_metrics_keys
) # [1, 2, None, ..]
all_none = True
for val in combined_metrics_values:
if val is not None:
all_none = False
if all_none:
return False
failed_request_values = combined_metrics_values[
: len(failed_request_keys)
]  # [1, 2, None, ..]
latency_values = combined_metrics_values[len(failed_request_keys) :]
# find top 5 failed
## Replace None values with a placeholder value (0 in this case)
placeholder_value = 0
replaced_failed_values = [
value if value is not None else placeholder_value
for value in failed_request_values
]
## Get the indices of top 5 keys with the highest numerical values (ignoring None values)
top_5_failed = sorted(
range(len(replaced_failed_values)),
key=lambda i: replaced_failed_values[i],
reverse=True,
)[:5]
# find top 5 slowest
# Replace None values with a placeholder value (0 in this case)
placeholder_value = 0
replaced_slowest_values = [
value if value is not None else placeholder_value
for value in latency_values
]
# Get the indices of top 5 values with the highest numerical values (ignoring None values)
top_5_slowest = sorted(
range(len(replaced_slowest_values)),
key=lambda i: replaced_slowest_values[i],
reverse=True,
)[:5]
# format alert -> return the litellm model name + api base
message = f"\n\nHere are today's key metrics 📈: \n\n"
message += "\n\n*❗️ Top 5 Deployments with Most Failed Requests:*\n\n"
for i in range(len(top_5_failed)):
key = failed_request_keys[top_5_failed[i]].split(":")[0]
_deployment = router.get_model_info(key)
if isinstance(_deployment, dict):
deployment_name = _deployment["litellm_params"].get("model", "")
else:
return False
api_base = litellm.get_api_base(
model=deployment_name,
optional_params=(
_deployment["litellm_params"] if _deployment is not None else {}
),
)
if api_base is None:
api_base = ""
value = replaced_failed_values[top_5_failed[i]]
message += f"\t{i+1}. Deployment: `{deployment_name}`, Failed Requests: `{value}`, API Base: `{api_base}`\n"
message += "\n\n*😅 Top 5 Slowest Deployments:*\n\n"
for i in range(len(top_5_slowest)):
key = latency_keys[top_5_slowest[i]].split(":")[0]
_deployment = router.get_model_info(key)
if _deployment is not None:
deployment_name = _deployment["litellm_params"].get("model", "")
else:
deployment_name = ""
api_base = litellm.get_api_base(
model=deployment_name,
optional_params=(
_deployment["litellm_params"] if _deployment is not None else {}
),
)
value = round(replaced_slowest_values[top_5_slowest[i]], 3)
message += f"\t{i+1}. Deployment: `{deployment_name}`, Latency per output token: `{value}s/token`, API Base: `{api_base}`\n\n"
# cache cleanup -> reset values to 0
latency_cache_keys = [(key, 0) for key in latency_keys]
failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
await self.internal_usage_cache.async_batch_set_cache(
cache_list=combined_metrics_cache_keys
)
# send alert
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
return True
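The report picks its top-5 lists by sorting index positions after substituting 0 for missing cache values. A compact illustration of that selection step on made-up numbers:

```
# Top-5 selection as used for the daily report, on sample data.
failed_values = [3, None, 7, 0, None, 12, 1]

# Replace None (no cached value) with 0 so every deployment is comparable.
replaced = [v if v is not None else 0 for v in failed_values]

# Sort *indices* by value, descending, and keep the first five.
top_5 = sorted(range(len(replaced)), key=lambda i: replaced[i], reverse=True)[:5]

print(top_5)                         # [5, 2, 0, 6, 1]
print([replaced[i] for i in top_5])  # [12, 7, 3, 1, 0]
```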
async def response_taking_too_long( async def response_taking_too_long(
self, self,
@ -255,6 +498,11 @@ class SlackAlerting:
# in that case we fallback to the api base set in the request metadata # in that case we fallback to the api base set in the request metadata
_metadata = request_data["metadata"] _metadata = request_data["metadata"]
_api_base = _metadata.get("api_base", "") _api_base = _metadata.get("api_base", "")
request_info = litellm.utils._add_key_name_and_team_to_alert(
request_info=request_info, metadata=_metadata
)
if _api_base is None: if _api_base is None:
_api_base = "" _api_base = ""
request_info += f"\nAPI Base: `{_api_base}`" request_info += f"\nAPI Base: `{_api_base}`"
@ -264,14 +512,13 @@ class SlackAlerting:
) )
if "langfuse" in litellm.success_callback: if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert( langfuse_url = self._add_langfuse_trace_id_to_alert(
request_info=request_info,
request_data=request_data, request_data=request_data,
type="hanging_request",
start_time=start_time,
end_time=end_time,
) )
if langfuse_url is not None:
request_info += "\n🪢 Langfuse Trace: {}".format(langfuse_url)
# add deployment latencies to alert # add deployment latencies to alert
_deployment_latency_map = self._get_deployment_latencies_to_alert( _deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=request_data.get("metadata", {}) metadata=request_data.get("metadata", {})
@ -404,6 +651,53 @@ class SlackAlerting:
return return
async def model_added_alert(self, model_name: str, litellm_model_name: str):
model_info = litellm.model_cost.get(litellm_model_name, {})
model_info_str = ""
for k, v in model_info.items():
if k == "input_cost_per_token" or k == "output_cost_per_token":
# when converting to string it should not be 1.63e-06
v = "{:.8f}".format(v)
model_info_str += f"{k}: {v}\n"
message = f"""
*🚅 New Model Added*
Model Name: `{model_name}`
Usage OpenAI Python SDK:
```
import openai
client = openai.OpenAI(
api_key="your_api_key",
base_url="{os.getenv("PROXY_BASE_URL", "http://0.0.0.0:4000")}"
)
response = client.chat.completions.create(
model="{model_name}", # model to send to the proxy
messages = [
{{
"role": "user",
"content": "this is a test request, write a short poem"
}}
]
)
```
Model Info:
```
{model_info_str}
```
"""
await self.send_alert(
message=message, level="Low", alert_type="new_model_added"
)
pass
async def model_removed_alert(self, model_name: str):
pass
async def send_alert( async def send_alert(
self, self,
message: str, message: str,
@ -414,7 +708,11 @@ class SlackAlerting:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
"new_model_added",
"cooldown_deployment",
], ],
**kwargs,
): ):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -439,9 +737,16 @@ class SlackAlerting:
# Get the current timestamp # Get the current timestamp
current_time = datetime.now().strftime("%H:%M:%S") current_time = datetime.now().strftime("%H:%M:%S")
_proxy_base_url = os.getenv("PROXY_BASE_URL", None) _proxy_base_url = os.getenv("PROXY_BASE_URL", None)
if alert_type == "daily_reports" or alert_type == "new_model_added":
formatted_message = message
else:
formatted_message = ( formatted_message = (
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}" f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
) )
if kwargs:
for key, value in kwargs.items():
formatted_message += f"\n\n{key}: `{value}`\n\n"
if _proxy_base_url is not None: if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`" formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
@ -451,6 +756,8 @@ class SlackAlerting:
and alert_type in self.alert_to_webhook_url and alert_type in self.alert_to_webhook_url
): ):
slack_webhook_url = self.alert_to_webhook_url[alert_type] slack_webhook_url = self.alert_to_webhook_url[alert_type]
elif self.default_webhook_url is not None:
slack_webhook_url = self.default_webhook_url
else: else:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None) slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
@ -468,3 +775,113 @@ class SlackAlerting:
pass pass
else: else:
print("Error sending slack alert. Error=", response.text) # noqa print("Error sending slack alert. Error=", response.text) # noqa
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
"""Log deployment latency"""
if "daily_reports" in self.alert_types:
model_id = (
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
)
response_s: timedelta = end_time - start_time
final_value = response_s
total_tokens = 0
if isinstance(response_obj, litellm.ModelResponse):
completion_tokens = response_obj.usage.completion_tokens
final_value = float(response_s.total_seconds() / completion_tokens)
await self.async_update_daily_reports(
DeploymentMetrics(
id=model_id,
failed_request=False,
latency_per_output_token=final_value,
updated_at=litellm.utils.get_utc_datetime(),
)
)
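The success hook normalizes latency by the number of completion tokens. A small worked example of that arithmetic (the timing and token count are made up; real code would also guard against a zero token count):

```
from datetime import datetime, timedelta

# Hypothetical request timing and token count.
start_time = datetime(2024, 5, 9, 12, 0, 0)
end_time = start_time + timedelta(seconds=3)
completion_tokens = 150

response_s: timedelta = end_time - start_time
latency_per_output_token = response_s.total_seconds() / completion_tokens
print(latency_per_output_token)  # 0.02 seconds per output token
```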
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
"""Log failure + deployment latency"""
if "daily_reports" in self.alert_types:
model_id = (
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
)
await self.async_update_daily_reports(
DeploymentMetrics(
id=model_id,
failed_request=True,
latency_per_output_token=None,
updated_at=litellm.utils.get_utc_datetime(),
)
)
if "llm_exceptions" in self.alert_types:
original_exception = kwargs.get("exception", None)
await self.send_alert(
message="LLM API Failure - " + str(original_exception),
level="High",
alert_type="llm_exceptions",
)
async def _run_scheduler_helper(self, llm_router) -> bool:
"""
Returns:
- True -> report sent
- False -> report not sent
"""
report_sent_bool = False
report_sent = await self.internal_usage_cache.async_get_cache(
key=SlackAlertingCacheKeys.report_sent_key.value
) # None | datetime
current_time = litellm.utils.get_utc_datetime()
if report_sent is None:
_current_time = current_time.isoformat()
await self.internal_usage_cache.async_set_cache(
key=SlackAlertingCacheKeys.report_sent_key.value,
value=_current_time,
)
else:
# check if current time - interval >= time last sent
delta = current_time - timedelta(
seconds=self.alerting_args.daily_report_frequency
)
if isinstance(report_sent, str):
report_sent = dt.fromisoformat(report_sent)
if delta >= report_sent:
# Sneak in the reporting logic here
await self.send_daily_reports(router=llm_router)
# Also, don't forget to update the report_sent time after sending the report!
_current_time = current_time.isoformat()
await self.internal_usage_cache.async_set_cache(
key=SlackAlertingCacheKeys.report_sent_key.value,
value=_current_time,
)
report_sent_bool = True
return report_sent_bool
async def _run_scheduled_daily_report(self, llm_router: Optional[Any] = None):
"""
If 'daily_reports' enabled
Ping redis cache every 5 minutes to check if we should send the report
If yes -> call send_daily_report()
"""
if llm_router is None or self.alert_types is None:
return
if "daily_reports" in self.alert_types:
while True:
await self._run_scheduler_helper(llm_router=llm_router)
interval = random.randint(
self.alerting_args.report_check_interval - 3,
self.alerting_args.report_check_interval + 3,
) # shuffle to prevent collisions
await asyncio.sleep(interval)
return
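Putting the pieces of this file together, a hedged sketch of how the daily-report alerting might be wired up by a caller. The import path, the `llm_router` argument, and the environment variable are assumptions; the exact wiring inside the proxy may differ.

```
# Usage sketch only; import path and router object are assumptions.
import asyncio
import os

from litellm.integrations.slack_alerting import SlackAlerting  # assumed module path

async def start_alerting(llm_router):  # llm_router: a configured litellm Router (placeholder)
    slack_alerting = SlackAlerting(
        alerting=["slack"],
        alert_types=["llm_exceptions", "llm_too_slow", "daily_reports"],
        alerting_args={
            "daily_report_frequency": 12 * 60 * 60,  # send at most every 12 hours
            "report_check_interval": 5 * 60,         # poll the cache roughly every 5 minutes
        },
        default_webhook_url=os.getenv("SLACK_WEBHOOK_URL"),
    )
    # Background task that decides, on each poll, whether a report is due.
    asyncio.create_task(
        slack_alerting._run_scheduled_daily_report(llm_router=llm_router)
    )
    # ... serve traffic; the success/failure hooks feed the daily metrics cache ...
```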

View file

@ -2,7 +2,7 @@
# On success + failure, log events to Supabase # On success + failure, log events to Supabase
import dotenv, os import dotenv, os
import requests import requests # type: ignore
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback

View file

@ -1,8 +1,8 @@
import os, types, traceback import os, types, traceback
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, httpx import time, httpx # type: ignore
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message from litellm.utils import ModelResponse, Choices, Message
import litellm import litellm

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage
import httpx import httpx # type: ignore
class AlephAlphaError(Exception): class AlephAlphaError(Exception):

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests, copy import requests, copy # type: ignore
import time import time
from typing import Callable, Optional, List from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
@ -9,7 +9,7 @@ import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .base import BaseLLM from .base import BaseLLM
import httpx import httpx # type: ignore
class AnthropicConstants(Enum): class AnthropicConstants(Enum):
@ -184,11 +184,6 @@ class AnthropicChatCompletion(BaseLLM):
message=str(completion_response["error"]), message=str(completion_response["error"]),
status_code=response.status_code, status_code=response.status_code,
) )
elif len(completion_response["content"]) == 0:
raise AnthropicError(
message="No content in response",
status_code=500,
)
else: else:
text_content = "" text_content = ""
tool_calls = [] tool_calls = []

View file

@ -1,4 +1,4 @@
from typing import Optional, Union, Any from typing import Optional, Union, Any, Literal
import types, requests import types, requests
from .base import BaseLLM from .base import BaseLLM
from litellm.utils import ( from litellm.utils import (
@ -12,7 +12,7 @@ from litellm.utils import (
from typing import Callable, Optional, BinaryIO from typing import Callable, Optional, BinaryIO
from litellm import OpenAIConfig from litellm import OpenAIConfig
import litellm, json import litellm, json
import httpx import httpx # type: ignore
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI from openai import AzureOpenAI, AsyncAzureOpenAI
import uuid import uuid
@ -952,6 +952,81 @@ class AzureChatCompletion(BaseLLM):
) )
raise e raise e
def get_headers(
self,
model: Optional[str],
api_key: str,
api_base: str,
api_version: str,
timeout: float,
mode: str,
messages: Optional[list] = None,
input: Optional[list] = None,
prompt: Optional[str] = None,
) -> dict:
client_session = litellm.client_session or httpx.Client(
transport=CustomHTTPTransport(), # handle dall-e-2 calls
)
if "gateway.ai.cloudflare.com" in api_base:
## build base url - assume api base includes resource name
if not api_base.endswith("/"):
api_base += "/"
api_base += f"{model}"
client = AzureOpenAI(
base_url=api_base,
api_version=api_version,
api_key=api_key,
timeout=timeout,
http_client=client_session,
)
model = None
# cloudflare ai gateway, needs model=None
else:
client = AzureOpenAI(
api_version=api_version,
azure_endpoint=api_base,
api_key=api_key,
timeout=timeout,
http_client=client_session,
)
# only run this check if it's not cloudflare ai gateway
if model is None and mode != "image_generation":
raise Exception("model is not set")
completion = None
if messages is None:
messages = [{"role": "user", "content": "Hey"}]
try:
completion = client.chat.completions.with_raw_response.create(
model=model, # type: ignore
messages=messages, # type: ignore
)
except Exception as e:
raise e
response = {}
if completion is None or not hasattr(completion, "headers"):
raise Exception("invalid completion response")
if (
completion.headers.get("x-ratelimit-remaining-requests", None) is not None
): # not provided for dall-e requests
response["x-ratelimit-remaining-requests"] = completion.headers[
"x-ratelimit-remaining-requests"
]
if completion.headers.get("x-ratelimit-remaining-tokens", None) is not None:
response["x-ratelimit-remaining-tokens"] = completion.headers[
"x-ratelimit-remaining-tokens"
]
if completion.headers.get("x-ms-region", None) is not None:
response["x-ms-region"] = completion.headers["x-ms-region"]
return response
async def ahealth_check( async def ahealth_check(
self, self,
model: Optional[str], model: Optional[str],
@ -963,7 +1038,7 @@ class AzureChatCompletion(BaseLLM):
messages: Optional[list] = None, messages: Optional[list] = None,
input: Optional[list] = None, input: Optional[list] = None,
prompt: Optional[str] = None, prompt: Optional[str] = None,
): ) -> dict:
client_session = litellm.aclient_session or httpx.AsyncClient( client_session = litellm.aclient_session or httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls
) )
@ -1040,4 +1115,8 @@ class AzureChatCompletion(BaseLLM):
response["x-ratelimit-remaining-tokens"] = completion.headers[ response["x-ratelimit-remaining-tokens"] = completion.headers[
"x-ratelimit-remaining-tokens" "x-ratelimit-remaining-tokens"
] ]
if completion.headers.get("x-ms-region", None) is not None:
response["x-ms-region"] = completion.headers["x-ms-region"]
return response return response
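Both `get_headers` and the updated `ahealth_check` read rate-limit and region information off the raw Azure response. A minimal sketch of that pattern using the `openai` SDK's `with_raw_response` wrapper; the endpoint, deployment name, and API version below are placeholders.

```
# Sketch: reading rate-limit / region headers from an Azure OpenAI response.
import os
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_API_BASE"],  # e.g. https://my-resource.openai.azure.com
    api_key=os.environ["AZURE_API_KEY"],
    api_version="2024-02-01",                     # placeholder API version
)

raw = client.chat.completions.with_raw_response.create(
    model="my-gpt-35-deployment",                 # placeholder deployment name
    messages=[{"role": "user", "content": "Hey"}],
)

headers = {}
for name in ("x-ratelimit-remaining-requests", "x-ratelimit-remaining-tokens", "x-ms-region"):
    if raw.headers.get(name) is not None:         # not every header is returned for every call
        headers[name] = raw.headers[name]

print(headers)
chat_completion = raw.parse()                     # the usual ChatCompletion object, if needed
```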

View file

@ -1,5 +1,5 @@
from typing import Optional, Union, Any from typing import Optional, Union, Any
import types, requests import types, requests # type: ignore
from .base import BaseLLM from .base import BaseLLM
from litellm.utils import ( from litellm.utils import (
ModelResponse, ModelResponse,

View file

@ -1,7 +1,7 @@
import os import os
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable from typing import Callable
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage

View file

@ -163,10 +163,9 @@ class AmazonAnthropicClaude3Config:
"stop", "stop",
"temperature", "temperature",
"top_p", "top_p",
"extra_headers" "extra_headers",
] ]
def map_openai_params(self, non_default_params: dict, optional_params: dict): def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items(): for param, value in non_default_params.items():
if param == "max_tokens": if param == "max_tokens":
@ -534,10 +533,12 @@ class AmazonStabilityConfig:
def add_custom_header(headers): def add_custom_header(headers):
"""Closure to capture the headers and add them.""" """Closure to capture the headers and add them."""
def callback(request, **kwargs): def callback(request, **kwargs):
"""Actual callback function that Boto3 will call.""" """Actual callback function that Boto3 will call."""
for header_name, header_value in headers.items(): for header_name, header_value in headers.items():
request.headers.add_header(header_name, header_value) request.headers.add_header(header_name, header_value)
return callback return callback
@ -672,7 +673,9 @@ def init_bedrock_client(
config=config, config=config,
) )
if extra_headers: if extra_headers:
client.meta.events.register('before-sign.bedrock-runtime.*', add_custom_header(extra_headers)) client.meta.events.register(
"before-sign.bedrock-runtime.*", add_custom_header(extra_headers)
)
return client return client
@ -1224,7 +1227,7 @@ def _embedding_func_single(
"input_type", "search_document" "input_type", "search_document"
) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3 ) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3
data = {"texts": [input], **inference_params} # type: ignore data = {"texts": [input], **inference_params} # type: ignore
body = json.dumps(data).encode("utf-8") body = json.dumps(data).encode("utf-8") # type: ignore
## LOGGING ## LOGGING
request_str = f""" request_str = f"""
response = client.invoke_model( response = client.invoke_model(
@ -1416,7 +1419,7 @@ def image_generation(
## LOGGING ## LOGGING
request_str = f""" request_str = f"""
response = client.invoke_model( response = client.invoke_model(
body={body}, body={body}, # type: ignore
modelId={modelId}, modelId={modelId},
accept="application/json", accept="application/json",
contentType="application/json", contentType="application/json",
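The `extra_headers` support works by registering a closure on botocore's event system so the headers are attached just before each request is signed. The same mechanism in isolation (region and header values are placeholders):

```
# Sketch: injecting custom headers into Bedrock runtime calls via botocore events.
import boto3

def add_custom_header(headers):
    """Closure that captures the headers to add."""
    def callback(request, **kwargs):
        for header_name, header_value in headers.items():
            request.headers.add_header(header_name, header_value)
    return callback

client = boto3.client("bedrock-runtime", region_name="us-east-1")  # placeholder region

extra_headers = {"x-my-proxy-tag": "team-a"}  # placeholder header
client.meta.events.register(
    "before-sign.bedrock-runtime.*", add_custom_header(extra_headers)
)
# Subsequent client.invoke_model(...) calls will carry the extra header.
```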

View file

@ -1,11 +1,11 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm
import httpx import httpx # type: ignore
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, traceback import time, traceback
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm import litellm
import httpx import httpx # type: ignore
class CohereError(Exception): class CohereError(Exception):

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, traceback import time, traceback
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm import litellm
import httpx import httpx # type: ignore
from .prompt_templates.factory import cohere_message_pt from .prompt_templates.factory import cohere_message_pt

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, traceback import time, traceback
from typing import Callable, Optional, List from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm

View file

@ -1,10 +1,10 @@
from itertools import chain from itertools import chain
import requests, types, time import requests, types, time # type: ignore
import json, uuid import json, uuid
import traceback import traceback
from typing import Optional from typing import Optional
import litellm import litellm
import httpx, aiohttp, asyncio import httpx, aiohttp, asyncio # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
@ -220,7 +220,10 @@ def get_ollama_response(
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -232,7 +235,9 @@ def get_ollama_response(
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model model_response["model"] = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", ""))) completion_tokens = response_json.get(
"eval_count", len(response_json.get("message", dict()).get("content", ""))
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
@ -273,7 +278,10 @@ def ollama_completion_stream(url, data, logging_obj):
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -316,7 +324,8 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
[ [
chunk.choices[0].delta.content chunk.choices[0].delta.content
async for chunk in streamwrapper async for chunk in streamwrapper
if chunk.choices[0].delta.content] if chunk.choices[0].delta.content
]
) )
function_call = json.loads(response_content) function_call = json.loads(response_content)
delta = litellm.utils.Delta( delta = litellm.utils.Delta(
@ -324,7 +333,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -373,7 +385,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -387,7 +402,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"] model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", ""))) completion_tokens = response_json.get(
"eval_count",
len(response_json.get("message", dict()).get("content", "")),
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
@ -474,3 +492,25 @@ async def ollama_aembeddings(
"total_tokens": total_input_tokens, "total_tokens": total_input_tokens,
} }
return model_response return model_response
def ollama_embeddings(
api_base: str,
model: str,
prompts: list,
optional_params=None,
logging_obj=None,
model_response=None,
encoding=None,
):
return asyncio.run(
ollama_aembeddings(
api_base,
model,
prompts,
optional_params,
logging_obj,
model_response,
encoding,
)
)
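The new `ollama_embeddings` is a synchronous shim over the async implementation via `asyncio.run`. The general pattern in isolation (the async function below is a stand-in, not LiteLLM's):

```
# General pattern: expose a sync entry point for an async implementation.
import asyncio

async def aembed(texts):
    # Stand-in for an async HTTP call to an embeddings endpoint.
    await asyncio.sleep(0)  # pretend network I/O
    return [[float(len(t))] for t in texts]

def embed(texts):
    # asyncio.run spins up and tears down an event loop around the coroutine.
    # Caveat: it raises if called from code already running inside an event loop.
    return asyncio.run(aembed(texts))

print(embed(["hello", "world!"]))  # [[5.0], [6.0]]
```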

View file

@ -1,7 +1,7 @@
import os import os
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage

View file

@ -22,7 +22,6 @@ from litellm.utils import (
TextCompletionResponse, TextCompletionResponse,
) )
from typing import Callable, Optional from typing import Callable, Optional
import aiohttp, requests
import litellm import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
from openai import OpenAI, AsyncOpenAI from openai import OpenAI, AsyncOpenAI
@ -531,6 +530,7 @@ class OpenAIChatCompletion(BaseLLM):
model=model, model=model,
custom_llm_provider="openai", custom_llm_provider="openai",
logging_obj=logging_obj, logging_obj=logging_obj,
stream_options=data.get("stream_options", None),
) )
return streamwrapper return streamwrapper
@ -580,6 +580,7 @@ class OpenAIChatCompletion(BaseLLM):
model=model, model=model,
custom_llm_provider="openai", custom_llm_provider="openai",
logging_obj=logging_obj, logging_obj=logging_obj,
stream_options=data.get("stream_options", None),
) )
return streamwrapper return streamwrapper
except ( except (

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm

View file

@ -981,7 +981,7 @@ def anthropic_messages_pt(messages: list):
# add role=tool support to allow function call result/error submission # add role=tool support to allow function call result/error submission
user_message_types = {"user", "tool", "function"} user_message_types = {"user", "tool", "function"}
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them. # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
new_messages = [] new_messages: list = []
msg_i = 0 msg_i = 0
tool_use_param = False tool_use_param = False
while msg_i < len(messages): while msg_i < len(messages):

View file

@ -1,11 +1,11 @@
import os, types import os, types
import json import json
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
import litellm import litellm
import httpx import httpx # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -1,14 +1,14 @@
import os, types, traceback import os, types, traceback
from enum import Enum from enum import Enum
import json import json
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional, Any from typing import Callable, Optional, Any
import litellm import litellm
from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage
import sys import sys
from copy import deepcopy from copy import deepcopy
import httpx import httpx # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
@ -295,7 +295,7 @@ def completion(
EndpointName={model}, EndpointName={model},
InferenceComponentName={model_id}, InferenceComponentName={model_id},
ContentType="application/json", ContentType="application/json",
Body={data}, Body={data}, # type: ignore
CustomAttributes="accept_eula=true", CustomAttributes="accept_eula=true",
) )
""" # type: ignore """ # type: ignore
@ -321,7 +321,7 @@ def completion(
response = client.invoke_endpoint( response = client.invoke_endpoint(
EndpointName={model}, EndpointName={model},
ContentType="application/json", ContentType="application/json",
Body={data}, Body={data}, # type: ignore
CustomAttributes="accept_eula=true", CustomAttributes="accept_eula=true",
) )
""" # type: ignore """ # type: ignore
@ -688,7 +688,7 @@ def embedding(
response = client.invoke_endpoint( response = client.invoke_endpoint(
EndpointName={model}, EndpointName={model},
ContentType="application/json", ContentType="application/json",
Body={data}, Body={data}, # type: ignore
CustomAttributes="accept_eula=true", CustomAttributes="accept_eula=true",
)""" # type: ignore )""" # type: ignore
logging_obj.pre_call( logging_obj.pre_call(

View file

@ -6,11 +6,11 @@ Reference: https://docs.together.ai/docs/openai-api-compatibility
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm
import httpx import httpx # type: ignore
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional, Union, List from typing import Callable, Optional, Union, List
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid import litellm, uuid
import httpx, inspect import httpx, inspect # type: ignore
class VertexAIError(Exception): class VertexAIError(Exception):

View file

@ -3,7 +3,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests, copy import requests, copy # type: ignore
import time, uuid import time, uuid
from typing import Callable, Optional, List from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
@ -17,7 +17,7 @@ from .prompt_templates.factory import (
extract_between_tags, extract_between_tags,
parse_xml_params, parse_xml_params,
) )
import httpx import httpx # type: ignore
class VertexAIError(Exception): class VertexAIError(Exception):

View file

@ -1,8 +1,8 @@
import os import os
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, httpx import time, httpx # type: ignore
from typing import Callable, Any from typing import Callable, Any
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -3,8 +3,8 @@ import json, types, time # noqa: E401
from contextlib import contextmanager from contextlib import contextmanager
from typing import Callable, Dict, Optional, Any, Union, List from typing import Callable, Dict, Optional, Any, Union, List
import httpx import httpx # type: ignore
import requests import requests # type: ignore
import litellm import litellm
from litellm.utils import ModelResponse, get_secret, Usage from litellm.utils import ModelResponse, get_secret, Usage

View file

@ -12,9 +12,9 @@ from typing import Any, Literal, Union, BinaryIO
from functools import partial from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy from copy import deepcopy
import httpx import httpx
import litellm import litellm
from ._logging import verbose_logger from ._logging import verbose_logger
from litellm import ( # type: ignore from litellm import ( # type: ignore
client, client,
@ -188,6 +188,7 @@ async def acompletion(
top_p: Optional[float] = None, top_p: Optional[float] = None,
n: Optional[int] = None, n: Optional[int] = None,
stream: Optional[bool] = None, stream: Optional[bool] = None,
stream_options: Optional[dict] = None,
stop=None, stop=None,
max_tokens: Optional[int] = None, max_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None, presence_penalty: Optional[float] = None,
@ -207,6 +208,7 @@ async def acompletion(
api_version: Optional[str] = None, api_version: Optional[str] = None,
api_key: Optional[str] = None, api_key: Optional[str] = None,
model_list: Optional[list] = None, # pass in a list of api_base,keys, etc. model_list: Optional[list] = None, # pass in a list of api_base,keys, etc.
extra_headers: Optional[dict] = None,
# Optional liteLLM function params # Optional liteLLM function params
**kwargs, **kwargs,
): ):
@ -224,6 +226,7 @@ async def acompletion(
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
n (int, optional): The number of completions to generate (default is 1). n (int, optional): The number of completions to generate (default is 1).
stream (bool, optional): If True, return a streaming response (default is False). stream (bool, optional): If True, return a streaming response (default is False).
stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
@ -261,6 +264,7 @@ async def acompletion(
"top_p": top_p, "top_p": top_p,
"n": n, "n": n,
"stream": stream, "stream": stream,
"stream_options": stream_options,
"stop": stop, "stop": stop,
"max_tokens": max_tokens, "max_tokens": max_tokens,
"presence_penalty": presence_penalty, "presence_penalty": presence_penalty,
@ -305,6 +309,7 @@ async def acompletion(
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface" or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
@ -457,6 +462,7 @@ def completion(
top_p: Optional[float] = None, top_p: Optional[float] = None,
n: Optional[int] = None, n: Optional[int] = None,
stream: Optional[bool] = None, stream: Optional[bool] = None,
stream_options: Optional[dict] = None,
stop=None, stop=None,
max_tokens: Optional[int] = None, max_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None, presence_penalty: Optional[float] = None,
@ -496,6 +502,7 @@ def completion(
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
n (int, optional): The number of completions to generate (default is 1). n (int, optional): The number of completions to generate (default is 1).
stream (bool, optional): If True, return a streaming response (default is False). stream (bool, optional): If True, return a streaming response (default is False).
stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
@ -573,6 +580,7 @@ def completion(
"top_p", "top_p",
"n", "n",
"stream", "stream",
"stream_options",
"stop", "stop",
"max_tokens", "max_tokens",
"presence_penalty", "presence_penalty",
@ -648,6 +656,8 @@ def completion(
"base_model", "base_model",
"stream_timeout", "stream_timeout",
"supports_system_message", "supports_system_message",
"region_name",
"allowed_model_region",
] ]
default_params = openai_params + litellm_params default_params = openai_params + litellm_params
non_default_params = { non_default_params = {
@ -783,6 +793,7 @@ def completion(
top_p=top_p, top_p=top_p,
n=n, n=n,
stream=stream, stream=stream,
stream_options=stream_options,
stop=stop, stop=stop,
max_tokens=max_tokens, max_tokens=max_tokens,
presence_penalty=presence_penalty, presence_penalty=presence_penalty,
@ -982,6 +993,7 @@ def completion(
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "anyscale" or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral" or custom_llm_provider == "mistral"
or custom_llm_provider == "openai" or custom_llm_provider == "openai"
@ -2565,6 +2577,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "fireworks_ai" or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai"
@ -2714,6 +2727,8 @@ def embedding(
"ttl", "ttl",
"cache", "cache",
"no-log", "no-log",
"region_name",
"allowed_model_region",
] ]
default_params = openai_params + litellm_params default_params = openai_params + litellm_params
non_default_params = { non_default_params = {
@ -2947,8 +2962,10 @@ def embedding(
model=model, # type: ignore model=model, # type: ignore
llm_provider="ollama", # type: ignore llm_provider="ollama", # type: ignore
) )
if aembedding: ollama_embeddings_fn = (
response = ollama.ollama_aembeddings( ollama.ollama_aembeddings if aembedding else ollama.ollama_embeddings
)
response = ollama_embeddings_fn(
api_base=api_base, api_base=api_base,
model=model, model=model,
prompts=input, prompts=input,
@ -3085,11 +3102,13 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "fireworks_ai" or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface" or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai"
or custom_llm_provider in litellm.openai_compatible_providers
): # currently implemented aiohttp calls for just azure and openai, soon all. ): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally # Await normally
response = await loop.run_in_executor(None, func_with_context) response = await loop.run_in_executor(None, func_with_context)
@ -3120,6 +3139,8 @@ async def atext_completion(*args, **kwargs):
## TRANSLATE CHAT TO TEXT FORMAT ## ## TRANSLATE CHAT TO TEXT FORMAT ##
if isinstance(response, TextCompletionResponse): if isinstance(response, TextCompletionResponse):
return response return response
elif asyncio.iscoroutine(response):
response = await response
text_completion_response = TextCompletionResponse() text_completion_response = TextCompletionResponse()
text_completion_response["id"] = response.get("id", None) text_completion_response["id"] = response.get("id", None)
@ -3581,6 +3602,8 @@ def image_generation(
"caching_groups", "caching_groups",
"ttl", "ttl",
"cache", "cache",
"region_name",
"allowed_model_region",
] ]
default_params = openai_params + litellm_params default_params = openai_params + litellm_params
non_default_params = { non_default_params = {
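Among the changes in this file, `completion()` and `acompletion()` now accept a `stream_options` dict that is forwarded to OpenAI-compatible backends when `stream=True`. A hedged usage sketch; `include_usage` is the option OpenAI documents for its streaming API and is assumed to be what callers would pass here:

```
# Sketch: passing stream_options through litellm.completion (needs OPENAI_API_KEY set).
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hi in one word."}],
    stream=True,
    stream_options={"include_usage": True},  # assumed option; only meaningful with stream=True
)

for chunk in response:
    # Most chunks carry content deltas; with include_usage, a final chunk carries usage totals.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```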

View file

@ -739,6 +739,24 @@
"litellm_provider": "mistral", "litellm_provider": "mistral",
"mode": "embedding" "mode": "embedding"
}, },
"deepseek-chat": {
"max_tokens": 4096,
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
"litellm_provider": "deepseek",
"mode": "chat"
},
"deepseek-coder": {
"max_tokens": 4096,
"max_input_tokens": 16000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
"litellm_provider": "deepseek",
"mode": "chat"
},
"groq/llama2-70b-4096": { "groq/llama2-70b-4096": {
"max_tokens": 4096, "max_tokens": 4096,
"max_input_tokens": 4096, "max_input_tokens": 4096,
@ -1060,8 +1078,8 @@
"max_tokens": 8192, "max_tokens": 8192,
"max_input_tokens": 1000000, "max_input_tokens": 1000000,
"max_output_tokens": 8192, "max_output_tokens": 8192,
"input_cost_per_token": 0, "input_cost_per_token": 0.000000625,
"output_cost_per_token": 0, "output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models", "litellm_provider": "vertex_ai-language-models",
"mode": "chat", "mode": "chat",
"supports_function_calling": true, "supports_function_calling": true,
@ -1072,8 +1090,8 @@
"max_tokens": 8192, "max_tokens": 8192,
"max_input_tokens": 1000000, "max_input_tokens": 1000000,
"max_output_tokens": 8192, "max_output_tokens": 8192,
"input_cost_per_token": 0, "input_cost_per_token": 0.000000625,
"output_cost_per_token": 0, "output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models", "litellm_provider": "vertex_ai-language-models",
"mode": "chat", "mode": "chat",
"supports_function_calling": true, "supports_function_calling": true,
@ -1084,8 +1102,8 @@
"max_tokens": 8192, "max_tokens": 8192,
"max_input_tokens": 1000000, "max_input_tokens": 1000000,
"max_output_tokens": 8192, "max_output_tokens": 8192,
"input_cost_per_token": 0, "input_cost_per_token": 0.000000625,
"output_cost_per_token": 0, "output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models", "litellm_provider": "vertex_ai-language-models",
"mode": "chat", "mode": "chat",
"supports_function_calling": true, "supports_function_calling": true,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1 +1 @@
[minified Next.js webpack runtime chunk for the admin UI; diff collapsed — the regenerated bundle only updates the CSS asset hash from static/css/00c2ddbcd01819c0.css to static/css/a1602eb39f799143.css]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1 +1 @@
[prerendered LiteLLM Dashboard HTML shell; diff collapsed — only generated asset references change: webpack-202e312607f242a1.js → webpack-5b257e1ab47d4b4a.js, CSS 00c2ddbcd01819c0.css → a1602eb39f799143.css, page chunks 142-11990a208bf93746.js / page-d9bdfedbff191985.js → 566-ccd699ab19124658.js / page-c804e862b63be987.js, buildId e55gTzpa2g2-9SwXgA9Uo → K8KXTbmuI2ArWjjdMi2iq]


@ -1,7 +1,7 @@
[Next.js RSC payload for the dashboard page; diff collapsed — it re-references the same regenerated chunks, CSS hash, and buildId listed above]


@ -4,6 +4,22 @@ model_list:
api_key: my-fake-key api_key: my-fake-key
model: openai/my-fake-model model: openai/my-fake-model
model_name: fake-openai-endpoint model_name: fake-openai-endpoint
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key-2
model: openai/my-fake-model-2
model_name: fake-openai-endpoint
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key-3
model: openai/my-fake-model-3
model_name: fake-openai-endpoint
- model_name: gpt-4
litellm_params:
model: gpt-3.5-turbo
- litellm_params:
model: together_ai/codellama/CodeLlama-13b-Instruct-hf
model_name: CodeLlama-13b-Instruct
router_settings: router_settings:
num_retries: 0 num_retries: 0
enable_pre_call_checks: true enable_pre_call_checks: true
@ -15,8 +31,11 @@ router_settings:
routing_strategy: "latency-based-routing" routing_strategy: "latency-based-routing"
litellm_settings: litellm_settings:
success_callback: ["openmeter"] success_callback: ["langfuse"]
general_settings: general_settings:
alerting: ["slack"] alerting: ["slack"]
alert_types: ["llm_exceptions"] alert_types: ["llm_exceptions", "daily_reports"]
alerting_args:
daily_report_frequency: 60 # every minute
report_check_interval: 5 # every 5s
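
For anyone using the SDK instead of the proxy, a rough Python equivalent of the router and litellm settings in this config (model list trimmed, keys are placeholders; the Slack alerting block has no direct SDK equivalent here):

```python
# rough SDK-side equivalent of the proxy config above
import litellm
from litellm import Router

litellm.success_callback = ["langfuse"]  # litellm_settings.success_callback

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
            },
        },
    ],
    num_retries=0,                            # router_settings.num_retries
    enable_pre_call_checks=True,              # router_settings.enable_pre_call_checks
    routing_strategy="latency-based-routing",
)
```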


@ -458,6 +458,27 @@ class UpdateUserRequest(GenerateRequestBase):
return values return values
class NewEndUserRequest(LiteLLMBase):
user_id: str
alias: Optional[str] = None # human-friendly alias
blocked: bool = False # allow/disallow requests for this end-user
max_budget: Optional[float] = None
budget_id: Optional[str] = None # give either a budget_id or max_budget
allowed_model_region: Optional[Literal["eu"]] = (
None # require all user requests to use models in this specific region
)
default_model: Optional[str] = (
None # if no equivalent model in allowed region - default all requests to this model
)
@root_validator(pre=True)
def check_user_info(cls, values):
if values.get("max_budget") is not None and values.get("budget_id") is not None:
raise ValueError("Set either 'max_budget' or 'budget_id', not both.")
return values
class Member(LiteLLMBase): class Member(LiteLLMBase):
role: Literal["admin", "user"] role: Literal["admin", "user"]
user_id: Optional[str] = None user_id: Optional[str] = None
@ -494,6 +515,8 @@ class NewTeamRequest(TeamBase):
class GlobalEndUsersSpend(LiteLLMBase): class GlobalEndUsersSpend(LiteLLMBase):
api_key: Optional[str] = None api_key: Optional[str] = None
startTime: Optional[datetime] = None
endTime: Optional[datetime] = None
class TeamMemberAddRequest(LiteLLMBase): class TeamMemberAddRequest(LiteLLMBase):
@ -836,6 +859,7 @@ class UserAPIKeyAuth(
api_key: Optional[str] = None api_key: Optional[str] = None
user_role: Optional[Literal["proxy_admin", "app_owner", "app_user"]] = None user_role: Optional[Literal["proxy_admin", "app_owner", "app_user"]] = None
allowed_model_region: Optional[Literal["eu"]] = None
@root_validator(pre=True) @root_validator(pre=True)
def check_api_key(cls, values): def check_api_key(cls, values):
@ -881,6 +905,8 @@ class LiteLLM_EndUserTable(LiteLLMBase):
blocked: bool blocked: bool
alias: Optional[str] = None alias: Optional[str] = None
spend: float = 0.0 spend: float = 0.0
allowed_model_region: Optional[Literal["eu"]] = None
default_model: Optional[str] = None
litellm_budget_table: Optional[LiteLLM_BudgetTable] = None litellm_budget_table: Optional[LiteLLM_BudgetTable] = None
@root_validator(pre=True) @root_validator(pre=True)
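
A small sketch of the `NewEndUserRequest` validation added above (the import path assumes the upstream `litellm.proxy._types` module; ids and model names are placeholders):

```python
from litellm.proxy._types import NewEndUserRequest

# valid: budget comes from a budget_id, requests pinned to EU-hosted models
end_user = NewEndUserRequest(
    user_id="customer-123",
    alias="Acme Corp",
    budget_id="budget-eu-default",
    allowed_model_region="eu",
    default_model="azure/gpt-35-turbo-eu",
)

# invalid: the root validator rejects setting both budget fields at once
try:
    NewEndUserRequest(user_id="customer-456", max_budget=10.0, budget_id="budget-1")
except ValueError as err:  # pydantic surfaces the validator message here
    print(err)
```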


@ -206,9 +206,9 @@ async def get_end_user_object(
if end_user_id is None: if end_user_id is None:
return None return None
_key = "end_user_id:{}".format(end_user_id)
# check if in cache # check if in cache
cached_user_obj = user_api_key_cache.async_get_cache(key=end_user_id) cached_user_obj = await user_api_key_cache.async_get_cache(key=_key)
if cached_user_obj is not None: if cached_user_obj is not None:
if isinstance(cached_user_obj, dict): if isinstance(cached_user_obj, dict):
return LiteLLM_EndUserTable(**cached_user_obj) return LiteLLM_EndUserTable(**cached_user_obj)
@ -223,7 +223,14 @@ async def get_end_user_object(
if response is None: if response is None:
raise Exception raise Exception
return LiteLLM_EndUserTable(**response.dict()) # save the end-user object to cache
await user_api_key_cache.async_set_cache(
key="end_user_id:{}".format(end_user_id), value=response
)
_response = LiteLLM_EndUserTable(**response.dict())
return _response
except Exception as e: # if end-user not in db except Exception as e: # if end-user not in db
return None return None
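
The change above namespaces the cache key as `end_user_id:<id>`, awaits the async cache read (previously the coroutine was never awaited), and writes the DB result back to the cache. A standalone sketch of that read-through pattern with litellm's `DualCache`; the DB lookup is stubbed out:

```python
import asyncio
from litellm.caching import DualCache

user_api_key_cache = DualCache()

async def get_end_user(end_user_id: str):
    _key = "end_user_id:{}".format(end_user_id)
    cached = await user_api_key_cache.async_get_cache(key=_key)
    if cached is not None:
        return cached
    db_row = {"user_id": end_user_id, "blocked": False}  # stand-in for the Prisma query
    await user_api_key_cache.async_set_cache(key=_key, value=db_row)
    return db_row

asyncio.run(get_end_user("customer-123"))
```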


@ -15,6 +15,9 @@ from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
from litellm.proxy.utils import PrismaClient from litellm.proxy.utils import PrismaClient
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from typing import Optional from typing import Optional
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization
class JWTHandler: class JWTHandler:
@ -142,8 +145,8 @@ class JWTHandler:
public_key = keys[0] public_key = keys[0]
elif len(keys) > 1: elif len(keys) > 1:
for key in keys: for key in keys:
if kid is not None and key["kid"] == kid: if kid is not None and key == kid:
public_key = key public_key = keys[key]
if public_key is None: if public_key is None:
raise Exception( raise Exception(
@ -153,6 +156,11 @@ class JWTHandler:
return public_key return public_key
async def auth_jwt(self, token: str) -> dict: async def auth_jwt(self, token: str) -> dict:
audience = os.getenv("JWT_AUDIENCE")
decode_options = None
if audience is None:
decode_options = {"verify_aud": False}
from jwt.algorithms import RSAAlgorithm from jwt.algorithms import RSAAlgorithm
header = jwt.get_unverified_header(token) header = jwt.get_unverified_header(token)
@ -182,7 +190,33 @@ class JWTHandler:
token, token,
public_key_rsa, # type: ignore public_key_rsa, # type: ignore
algorithms=["RS256"], algorithms=["RS256"],
options={"verify_aud": False}, options=decode_options,
audience=audience,
)
return payload
except jwt.ExpiredSignatureError:
# the token is expired, do something to refresh it
raise Exception("Token Expired")
except Exception as e:
raise Exception(f"Validation fails: {str(e)}")
elif public_key is not None and isinstance(public_key, str):
try:
cert = x509.load_pem_x509_certificate(public_key.encode(), default_backend())
# Extract public key
key = cert.public_key().public_bytes(
serialization.Encoding.PEM,
serialization.PublicFormat.SubjectPublicKeyInfo
)
# decode the token using the public key
payload = jwt.decode(
token,
key,
algorithms=["RS256"],
audience=audience,
options=decode_options
) )
return payload return payload
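
A standalone sketch of the decode paths added above: audience verification is skipped only when `JWT_AUDIENCE` is unset, and a PEM certificate string is converted to a public key before decoding (PyJWT + cryptography, mirroring the imports in this diff):

```python
import os
import jwt
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization

def decode_with_pem_cert(token: str, pem_cert: str) -> dict:
    audience = os.getenv("JWT_AUDIENCE")
    decode_options = {"verify_aud": False} if audience is None else None

    # extract the RSA public key from the PEM-encoded certificate
    cert = x509.load_pem_x509_certificate(pem_cert.encode(), default_backend())
    public_key = cert.public_key().public_bytes(
        serialization.Encoding.PEM,
        serialization.PublicFormat.SubjectPublicKeyInfo,
    )
    return jwt.decode(
        token,
        public_key,
        algorithms=["RS256"],
        audience=audience,
        options=decode_options,
    )
```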


@ -252,7 +252,7 @@ def run_server(
if model and "ollama" in model and api_base is None: if model and "ollama" in model and api_base is None:
run_ollama_serve() run_ollama_serve()
if test_async is True: if test_async is True:
import requests, concurrent, time import requests, concurrent, time # type: ignore
api_base = f"http://{host}:{port}" api_base = f"http://{host}:{port}"
@ -418,7 +418,7 @@ def run_server(
read from there and save it to os.env['DATABASE_URL'] read from there and save it to os.env['DATABASE_URL']
""" """
try: try:
import yaml, asyncio import yaml, asyncio # type: ignore
except: except:
raise ImportError( raise ImportError(
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`" "yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"

File diff suppressed because it is too large


@ -150,6 +150,8 @@ model LiteLLM_EndUserTable {
user_id String @id user_id String @id
alias String? // admin-facing alias alias String? // admin-facing alias
spend Float @default(0.0) spend Float @default(0.0)
allowed_model_region String? // require all user requests to use models in this specific region
default_model String? // use along with 'allowed_model_region'. if no available model in region, default to this model.
budget_id String? budget_id String?
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id]) litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
blocked Boolean @default(false) blocked Boolean @default(false)


@ -73,6 +73,7 @@ class ProxyLogging:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
] = [ ] = [
"llm_exceptions", "llm_exceptions",
@ -80,11 +81,13 @@ class ProxyLogging:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
self.slack_alerting_instance = SlackAlerting( self.slack_alerting_instance = SlackAlerting(
alerting_threshold=self.alerting_threshold, alerting_threshold=self.alerting_threshold,
alerting=self.alerting, alerting=self.alerting,
alert_types=self.alert_types, alert_types=self.alert_types,
internal_usage_cache=self.internal_usage_cache,
) )
def update_values( def update_values(
@ -100,9 +103,11 @@ class ProxyLogging:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
] ]
] = None, ] = None,
alerting_args: Optional[dict] = None,
): ):
self.alerting = alerting self.alerting = alerting
if alerting_threshold is not None: if alerting_threshold is not None:
@ -114,8 +119,12 @@ class ProxyLogging:
alerting=self.alerting, alerting=self.alerting,
alerting_threshold=self.alerting_threshold, alerting_threshold=self.alerting_threshold,
alert_types=self.alert_types, alert_types=self.alert_types,
alerting_args=alerting_args,
) )
if "daily_reports" in self.alert_types:
litellm.callbacks.append(self.slack_alerting_instance) # type: ignore
if redis_cache is not None: if redis_cache is not None:
self.internal_usage_cache.redis_cache = redis_cache self.internal_usage_cache.redis_cache = redis_cache
@ -293,6 +302,7 @@ class ProxyLogging:
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
], ],
request_data: Optional[dict] = None,
): ):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -322,10 +332,19 @@ class ProxyLogging:
if _proxy_base_url is not None: if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`" formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
extra_kwargs = {}
if request_data is not None:
_url = self.slack_alerting_instance._add_langfuse_trace_id_to_alert(
request_data=request_data
)
if _url is not None:
extra_kwargs["🪢 Langfuse Trace"] = _url
formatted_message += "\n\n🪢 Langfuse Trace: {}".format(_url)
for client in self.alerting: for client in self.alerting:
if client == "slack": if client == "slack":
await self.slack_alerting_instance.send_alert( await self.slack_alerting_instance.send_alert(
message=message, level=level, alert_type=alert_type message=message, level=level, alert_type=alert_type, **extra_kwargs
) )
elif client == "sentry": elif client == "sentry":
if litellm.utils.sentry_sdk_instance is not None: if litellm.utils.sentry_sdk_instance is not None:
@ -360,6 +379,7 @@ class ProxyLogging:
message=f"DB read/write call failed: {error_message}", message=f"DB read/write call failed: {error_message}",
level="High", level="High",
alert_type="db_exceptions", alert_type="db_exceptions",
request_data={},
) )
) )
@ -375,7 +395,10 @@ class ProxyLogging:
litellm.utils.capture_exception(error=original_exception) litellm.utils.capture_exception(error=original_exception)
async def post_call_failure_hook( async def post_call_failure_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth self,
original_exception: Exception,
user_api_key_dict: UserAPIKeyAuth,
request_data: dict,
): ):
""" """
Allows users to raise custom exceptions/log when a call fails, without having to deal with parsing Request body. Allows users to raise custom exceptions/log when a call fails, without having to deal with parsing Request body.
@ -400,6 +423,7 @@ class ProxyLogging:
message=f"LLM API call failed: {str(original_exception)}", message=f"LLM API call failed: {str(original_exception)}",
level="High", level="High",
alert_type="llm_exceptions", alert_type="llm_exceptions",
request_data=request_data,
) )
) )
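
Taken together, the `ProxyLogging` changes above thread a new `daily_reports` alert type, per-alert `alerting_args`, and the originating `request_data` (used for Langfuse trace links) through to `SlackAlerting`. A hedged sketch of wiring it up directly, with argument names taken from this diff rather than a published API:

```python
# internal proxy API sketch; not a stable public interface
from litellm.caching import DualCache
from litellm.proxy.utils import ProxyLogging

proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())
proxy_logging_obj.update_values(
    alerting=["slack"],
    alerting_threshold=300,
    redis_cache=None,
    alert_types=["llm_exceptions", "daily_reports"],
    alerting_args={"daily_report_frequency": 60, "report_check_interval": 5},
)
```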
@ -502,7 +526,7 @@ class PrismaClient:
finally: finally:
os.chdir(original_dir) os.chdir(original_dir)
# Now you can import the Prisma Client # Now you can import the Prisma Client
from prisma import Prisma # type: ignore from prisma import Prisma
self.db = Prisma() # Client to connect to Prisma db self.db = Prisma() # Client to connect to Prisma db
@ -1665,12 +1689,12 @@ def get_instance_fn(value: str, config_file_path: Optional[str] = None) -> Any:
module_file_path = os.path.join(directory, *module_name.split(".")) module_file_path = os.path.join(directory, *module_name.split("."))
module_file_path += ".py" module_file_path += ".py"
spec = importlib.util.spec_from_file_location(module_name, module_file_path) spec = importlib.util.spec_from_file_location(module_name, module_file_path) # type: ignore
if spec is None: if spec is None:
raise ImportError( raise ImportError(
f"Could not find a module specification for {module_file_path}" f"Could not find a module specification for {module_file_path}"
) )
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec) # type: ignore
spec.loader.exec_module(module) # type: ignore spec.loader.exec_module(module) # type: ignore
else: else:
# Dynamically import the module # Dynamically import the module


@ -21,6 +21,7 @@ from collections import defaultdict
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2 from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2
from litellm.llms.custom_httpx.azure_dall_e_2 import ( from litellm.llms.custom_httpx.azure_dall_e_2 import (
CustomHTTPTransport, CustomHTTPTransport,
@ -31,6 +32,7 @@ from litellm.utils import (
CustomStreamWrapper, CustomStreamWrapper,
get_utc_datetime, get_utc_datetime,
calculate_max_parallel_requests, calculate_max_parallel_requests,
_is_region_eu,
) )
import copy import copy
from litellm._logging import verbose_router_logger from litellm._logging import verbose_router_logger
@ -43,6 +45,7 @@ from litellm.types.router import (
updateDeployment, updateDeployment,
updateLiteLLMParams, updateLiteLLMParams,
RetryPolicy, RetryPolicy,
AlertingConfig,
) )
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
@ -98,9 +101,11 @@ class Router:
"least-busy", "least-busy",
"usage-based-routing", "usage-based-routing",
"latency-based-routing", "latency-based-routing",
"cost-based-routing",
] = "simple-shuffle", ] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based routing routing_strategy_args: dict = {}, # just for latency-based routing
semaphore: Optional[asyncio.Semaphore] = None, semaphore: Optional[asyncio.Semaphore] = None,
alerting_config: Optional[AlertingConfig] = None,
) -> None: ) -> None:
""" """
Initialize the Router class with the given parameters for caching, reliability, and routing strategy. Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
@ -127,9 +132,9 @@ class Router:
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0. retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None. allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1. cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle". routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing", "cost-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}. routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
alerting_config (AlertingConfig): Slack alerting configuration. Defaults to None.
Returns: Returns:
Router: An instance of the litellm.Router class. Router: An instance of the litellm.Router class.
@ -314,6 +319,9 @@ class Router:
self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = ( self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = (
model_group_retry_policy model_group_retry_policy
) )
self.alerting_config: Optional[AlertingConfig] = alerting_config
if self.alerting_config is not None:
self._initialize_alerting()
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict): def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
if routing_strategy == "least-busy": if routing_strategy == "least-busy":
@ -347,6 +355,14 @@ class Router:
) )
if isinstance(litellm.callbacks, list): if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
elif routing_strategy == "cost-based-routing":
self.lowestcost_logger = LowestCostLoggingHandler(
router_cache=self.cache,
model_list=self.model_list,
routing_args={},
)
if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestcost_logger) # type: ignore
def print_deployment(self, deployment: dict): def print_deployment(self, deployment: dict):
""" """
@ -1847,6 +1863,10 @@ class Router:
self.cache.set_cache( self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time value=cached_value, key=cooldown_key, ttl=cooldown_time
) )
self.send_deployment_cooldown_alert(
deployment_id=deployment, exception_status=exception_status
)
else: else:
self.failed_calls.set_cache( self.failed_calls.set_cache(
key=deployment, value=updated_fails, ttl=cooldown_time key=deployment, value=updated_fails, ttl=cooldown_time
@ -1980,7 +2000,11 @@ class Router:
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env # we do this here because we init clients for Azure, OpenAI and we need to set the right key
# we do this here because we init clients for Azure, OpenAI and we need to set the right key # we do this here because we init clients for Azure, OpenAI and we need to set the right key
api_key = litellm_params.get("api_key") or default_api_key api_key = litellm_params.get("api_key") or default_api_key
if api_key and api_key.startswith("os.environ/"): if (
api_key
and isinstance(api_key, str)
and api_key.startswith("os.environ/")
):
api_key_env_name = api_key.replace("os.environ/", "") api_key_env_name = api_key.replace("os.environ/", "")
api_key = litellm.get_secret(api_key_env_name) api_key = litellm.get_secret(api_key_env_name)
litellm_params["api_key"] = api_key litellm_params["api_key"] = api_key
@ -2004,6 +2028,7 @@ class Router:
if ( if (
is_azure_ai_studio_model == True is_azure_ai_studio_model == True
and api_base is not None and api_base is not None
and isinstance(api_base, str)
and not api_base.endswith("/v1/") and not api_base.endswith("/v1/")
): ):
# check if it ends with a trailing slash # check if it ends with a trailing slash
@ -2084,13 +2109,14 @@ class Router:
organization = litellm.get_secret(organization_env_name) organization = litellm.get_secret(organization_env_name)
litellm_params["organization"] = organization litellm_params["organization"] = organization
if "azure" in model_name: if "azure" in model_name and isinstance(api_key, str):
if api_base is None: if api_base is None or not isinstance(api_base, str):
raise ValueError( raise ValueError(
f"api_base is required for Azure OpenAI. Set it on your config. Model - {model}" f"api_base is required for Azure OpenAI. Set it on your config. Model - {model}"
) )
if api_version is None: if api_version is None:
api_version = "2023-07-01-preview" api_version = "2023-07-01-preview"
if "gateway.ai.cloudflare.com" in api_base: if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"): if not api_base.endswith("/"):
api_base += "/" api_base += "/"
@ -2513,7 +2539,7 @@ class Router:
self.default_deployment = deployment.to_json(exclude_none=True) self.default_deployment = deployment.to_json(exclude_none=True)
# Azure GPT-Vision Enhancements, users can pass os.environ/ # Azure GPT-Vision Enhancements, users can pass os.environ/
data_sources = deployment.litellm_params.get("dataSources", []) data_sources = deployment.litellm_params.get("dataSources", []) or []
for data_source in data_sources: for data_source in data_sources:
params = data_source.get("parameters", {}) params = data_source.get("parameters", {})
@ -2530,6 +2556,22 @@ class Router:
# init OpenAI, Azure clients # init OpenAI, Azure clients
self.set_client(model=deployment.to_json(exclude_none=True)) self.set_client(model=deployment.to_json(exclude_none=True))
# set region (if azure model)
try:
if "azure" in deployment.litellm_params.model:
region = litellm.utils.get_model_region(
litellm_params=deployment.litellm_params, mode=None
)
deployment.litellm_params.region_name = region
except Exception as e:
verbose_router_logger.error(
"Unable to get the region for azure model - {}, {}".format(
deployment.litellm_params.model, str(e)
)
)
pass # [NON-BLOCKING]
return deployment return deployment
def add_deployment(self, deployment: Deployment) -> Optional[Deployment]: def add_deployment(self, deployment: Deployment) -> Optional[Deployment]:
@ -2557,6 +2599,38 @@ class Router:
self.model_names.append(deployment.model_name) self.model_names.append(deployment.model_name)
return deployment return deployment
def upsert_deployment(self, deployment: Deployment) -> Deployment:
"""
Add or update deployment
Parameters:
- deployment: Deployment - the deployment to be added to the Router
Returns:
- The added/updated deployment
"""
# check if deployment already exists
if deployment.model_info.id in self.get_model_ids():
# remove the previous deployment
removal_idx: Optional[int] = None
for idx, model in enumerate(self.model_list):
if model["model_info"]["id"] == deployment.model_info.id:
removal_idx = idx
if removal_idx is not None:
self.model_list.pop(removal_idx)
# add to model list
_deployment = deployment.to_json(exclude_none=True)
self.model_list.append(_deployment)
# initialize client
self._add_deployment(deployment=deployment)
# add to model names
self.model_names.append(deployment.model_name)
return deployment
def delete_deployment(self, id: str) -> Optional[Deployment]: def delete_deployment(self, id: str) -> Optional[Deployment]:
""" """
Parameters: Parameters:
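
A quick sketch of the new `upsert_deployment()` helper. `Deployment`, `LiteLLM_Params`, and `ModelInfo` come from `litellm.types.router` (as used throughout this file); the explicit `model_info.id` below is only there to make the replace-on-second-call behaviour visible, and constructor defaults are assumed rather than re-verified:

```python
from litellm import Router
from litellm.types.router import Deployment, LiteLLM_Params, ModelInfo

router = Router(model_list=[])

deployment = Deployment(
    model_name="gpt-3.5-turbo",
    litellm_params=LiteLLM_Params(model="gpt-3.5-turbo", api_key="sk-..."),
    model_info=ModelInfo(id="my-deployment-1"),
)

router.upsert_deployment(deployment)   # not present yet -> appended
router.upsert_deployment(deployment)   # same model_info.id -> old entry replaced
print(router.get_model_ids())          # ["my-deployment-1"]
```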
@ -2580,11 +2654,21 @@ class Router:
except: except:
return None return None
def get_deployment(self, model_id: str): def get_deployment(self, model_id: str) -> Optional[Deployment]:
"""
Returns -> Deployment or None
Raise Exception -> if model found in invalid format
"""
for model in self.model_list: for model in self.model_list:
if "model_info" in model and "id" in model["model_info"]: if "model_info" in model and "id" in model["model_info"]:
if model_id == model["model_info"]["id"]: if model_id == model["model_info"]["id"]:
if isinstance(model, dict):
return Deployment(**model)
elif isinstance(model, Deployment):
return model return model
else:
raise Exception("Model invalid format - {}".format(type(model)))
return None return None
def get_model_info(self, id: str) -> Optional[dict]: def get_model_info(self, id: str) -> Optional[dict]:
@ -2597,7 +2681,10 @@ class Router:
return model return model
return None return None
def get_model_ids(self): def get_model_ids(self) -> List[str]:
"""
Returns list of model ids.
"""
ids = [] ids = []
for model in self.model_list: for model in self.model_list:
if "model_info" in model and "id" in model["model_info"]: if "model_info" in model and "id" in model["model_info"]:
@ -2605,7 +2692,7 @@ class Router:
ids.append(id) ids.append(id)
return ids return ids
def get_model_names(self): def get_model_names(self) -> List[str]:
return self.model_names return self.model_names
def get_model_list(self): def get_model_list(self):
@ -2631,6 +2718,7 @@ class Router:
"retry_after", "retry_after",
"fallbacks", "fallbacks",
"context_window_fallbacks", "context_window_fallbacks",
"model_group_retry_policy",
] ]
for var in vars_to_include: for var in vars_to_include:
@ -2656,6 +2744,7 @@ class Router:
"retry_after", "retry_after",
"fallbacks", "fallbacks",
"context_window_fallbacks", "context_window_fallbacks",
"model_group_retry_policy",
] ]
_int_settings = [ _int_settings = [
@ -2754,14 +2843,17 @@ class Router:
model: str, model: str,
healthy_deployments: List, healthy_deployments: List,
messages: List[Dict[str, str]], messages: List[Dict[str, str]],
allowed_model_region: Optional[Literal["eu"]] = None,
): ):
""" """
Filter out model in model group, if: Filter out model in model group, if:
- model context window < message length - model context window < message length
- filter models above rpm limits - filter models above rpm limits
- if region given, filter out models not in that region / unknown region
- [TODO] function call and model doesn't support function calling - [TODO] function call and model doesn't support function calling
""" """
verbose_router_logger.debug( verbose_router_logger.debug(
f"Starting Pre-call checks for deployments in model={model}" f"Starting Pre-call checks for deployments in model={model}"
) )
@ -2812,9 +2904,9 @@ class Router:
except Exception as e: except Exception as e:
verbose_router_logger.debug("An error occurs - {}".format(str(e))) verbose_router_logger.debug("An error occurs - {}".format(str(e)))
## RPM CHECK ##
_litellm_params = deployment.get("litellm_params", {}) _litellm_params = deployment.get("litellm_params", {})
model_id = deployment.get("model_info", {}).get("id", "") model_id = deployment.get("model_info", {}).get("id", "")
## RPM CHECK ##
### get local router cache ### ### get local router cache ###
current_request_cache_local = ( current_request_cache_local = (
self.cache.get_cache(key=model_id, local_only=True) or 0 self.cache.get_cache(key=model_id, local_only=True) or 0
@ -2842,6 +2934,28 @@ class Router:
_rate_limit_error = True _rate_limit_error = True
continue continue
## REGION CHECK ##
if allowed_model_region is not None:
if _litellm_params.get("region_name") is not None and isinstance(
_litellm_params["region_name"], str
):
# check if in allowed_model_region
if (
_is_region_eu(model_region=_litellm_params["region_name"])
== False
):
invalid_model_indices.append(idx)
continue
else:
verbose_router_logger.debug(
"Filtering out model - {}, as model_region=None, and allowed_model_region={}".format(
model_id, allowed_model_region
)
)
# filter out since region unknown, and user wants to filter for specific region
invalid_model_indices.append(idx)
continue
if len(invalid_model_indices) == len(_returned_deployments): if len(invalid_model_indices) == len(_returned_deployments):
""" """
- no healthy deployments available b/c context window checks or rate limit error - no healthy deployments available b/c context window checks or rate limit error
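
The new region check above filters deployments by their `region_name` when the caller's `allowed_model_region` (currently only `"eu"`) is set, and also drops deployments whose region is unknown. A sketch of a deployment advertising its region; the values are placeholders, and `allowed_model_region` itself normally arrives via the proxy's end-user settings rather than being set by hand:

```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-35-turbo",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EU_API_KEY",
                "api_base": "https://my-eu-endpoint.openai.azure.com",
                "region_name": "eu",  # read by the region check above
            },
        },
    ],
    enable_pre_call_checks=True,  # pre-call checks must be enabled for the filter to run
)
```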
@ -2943,6 +3057,7 @@ class Router:
if ( if (
self.routing_strategy != "usage-based-routing-v2" self.routing_strategy != "usage-based-routing-v2"
and self.routing_strategy != "simple-shuffle" and self.routing_strategy != "simple-shuffle"
and self.routing_strategy != "cost-based-routing"
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented. ): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
return self.get_available_deployment( return self.get_available_deployment(
model=model, model=model,
@ -2980,8 +3095,29 @@ class Router:
# filter pre-call checks # filter pre-call checks
if self.enable_pre_call_checks and messages is not None: if self.enable_pre_call_checks and messages is not None:
_allowed_model_region = (
request_kwargs.get("allowed_model_region")
if request_kwargs is not None
else None
)
if _allowed_model_region == "eu":
healthy_deployments = self._pre_call_checks( healthy_deployments = self._pre_call_checks(
model=model, healthy_deployments=healthy_deployments, messages=messages model=model,
healthy_deployments=healthy_deployments,
messages=messages,
allowed_model_region=_allowed_model_region,
)
else:
verbose_router_logger.debug(
"Ignoring given 'allowed_model_region'={}. Only 'eu' is allowed".format(
_allowed_model_region
)
)
healthy_deployments = self._pre_call_checks(
model=model,
healthy_deployments=healthy_deployments,
messages=messages,
) )
if len(healthy_deployments) == 0: if len(healthy_deployments) == 0:
@ -2999,6 +3135,16 @@ class Router:
messages=messages, messages=messages,
input=input, input=input,
) )
if (
self.routing_strategy == "cost-based-routing"
and self.lowestcost_logger is not None
):
deployment = await self.lowestcost_logger.async_get_available_deployments(
model_group=model,
healthy_deployments=healthy_deployments,
messages=messages,
input=input,
)
elif self.routing_strategy == "simple-shuffle": elif self.routing_strategy == "simple-shuffle":
# if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm # if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
############## Check if we can do a RPM/TPM based weighted pick ################# ############## Check if we can do a RPM/TPM based weighted pick #################
@ -3266,6 +3412,8 @@ class Router:
if retry_policy is None: if retry_policy is None:
return None return None
if isinstance(retry_policy, dict):
retry_policy = RetryPolicy(**retry_policy)
if ( if (
isinstance(exception, litellm.BadRequestError) isinstance(exception, litellm.BadRequestError)
and retry_policy.BadRequestErrorRetries is not None and retry_policy.BadRequestErrorRetries is not None
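
The `isinstance(retry_policy, dict)` coercion above lets per-model-group retry policies arrive as plain dicts (for example from a YAML proxy config) instead of `RetryPolicy` objects. A small sketch using the `model_group_retry_policy` attribute referenced earlier in this diff; model names and keys are placeholders:

```python
from litellm import Router
from litellm.types.router import RetryPolicy

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "os.environ/OPENAI_API_KEY"},
        }
    ],
    model_group_retry_policy={
        # a plain dict like {"BadRequestErrorRetries": 0} is now coerced to RetryPolicy
        "gpt-3.5-turbo": RetryPolicy(
            BadRequestErrorRetries=0,
            ContentPolicyViolationErrorRetries=3,
        )
    },
)
```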
@ -3292,6 +3440,56 @@ class Router:
): ):
return retry_policy.ContentPolicyViolationErrorRetries return retry_policy.ContentPolicyViolationErrorRetries
def _initialize_alerting(self):
from litellm.integrations.slack_alerting import SlackAlerting
router_alerting_config: AlertingConfig = self.alerting_config
_slack_alerting_logger = SlackAlerting(
alerting_threshold=router_alerting_config.alerting_threshold,
alerting=["slack"],
default_webhook_url=router_alerting_config.webhook_url,
)
litellm.callbacks.append(_slack_alerting_logger)
litellm.success_callback.append(
_slack_alerting_logger.response_taking_too_long_callback
)
print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa
def send_deployment_cooldown_alert(
self, deployment_id: str, exception_status: Union[str, int]
):
try:
from litellm.proxy.proxy_server import proxy_logging_obj
# trigger slack alert saying deployment is in cooldown
if (
proxy_logging_obj is not None
and proxy_logging_obj.alerting is not None
and "slack" in proxy_logging_obj.alerting
):
_deployment = self.get_deployment(model_id=deployment_id)
if _deployment is None:
return
_litellm_params = _deployment["litellm_params"]
temp_litellm_params = copy.deepcopy(_litellm_params)
temp_litellm_params = dict(temp_litellm_params)
_model_name = _deployment.get("model_name", None)
_api_base = litellm.get_api_base(
model=_model_name, optional_params=temp_litellm_params
)
asyncio.create_task(
proxy_logging_obj.slack_alerting_instance.send_alert(
message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
alert_type="cooldown_deployment",
level="Low",
)
)
except Exception as e:
pass
def flush_cache(self): def flush_cache(self):
litellm.cache = None litellm.cache = None
self.cache.flush_cache() self.cache.flush_cache()
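
`_initialize_alerting()` above wires a router-level `SlackAlerting` logger whenever an `AlertingConfig` is passed, and `send_deployment_cooldown_alert()` posts a notice when a deployment is put in cooldown. A minimal sketch of enabling it (the webhook URL and model entry are placeholders):

```python
from litellm import Router
from litellm.types.router import AlertingConfig

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "os.environ/OPENAI_API_KEY"},
        }
    ],
    alerting_config=AlertingConfig(
        alerting_threshold=300,  # seconds before a hanging request alerts
        webhook_url="https://hooks.slack.com/services/...",
    ),
)
```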


@ -6,7 +6,7 @@
# - use litellm.success + failure callbacks to log when a request completed # - use litellm.success + failure callbacks to log when a request completed
# - in get_available_deployment, for a given model group name -> pick based on traffic # - in get_available_deployment, for a given model group name -> pick based on traffic
import dotenv, os, requests, random import dotenv, os, requests, random # type: ignore
from typing import Optional from typing import Optional
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv


@ -0,0 +1,350 @@
#### What this does ####
# picks based on response time (for streaming, this is time to first token)
from pydantic import BaseModel, Extra, Field, root_validator
import dotenv, os, requests, random # type: ignore
from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta
import random
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_router_logger
from litellm import ModelResponse
from litellm import token_counter
import litellm
class LiteLLMBase(BaseModel):
"""
Implements default functions, all pydantic objects should have.
"""
def json(self, **kwargs):
try:
return self.model_dump() # noqa
except:
# if using pydantic v1
return self.dict()
class LowestCostLoggingHandler(CustomLogger):
test_flag: bool = False
logged_success: int = 0
logged_failure: int = 0
def __init__(
self, router_cache: DualCache, model_list: list, routing_args: dict = {}
):
self.router_cache = router_cache
self.model_list = model_list
async def log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
"""
Update usage on success
"""
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
# ------------
# Setup values
# ------------
"""
{
{model_group}_map: {
id: {
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
}
}
}
"""
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
cost_key = f"{model_group}_map"
response_ms: timedelta = end_time - start_time
final_value = response_ms
total_tokens = 0
if isinstance(response_obj, ModelResponse):
completion_tokens = response_obj.usage.completion_tokens
total_tokens = response_obj.usage.total_tokens
final_value = float(response_ms.total_seconds() / completion_tokens)
# ------------
# Update usage
# ------------
request_count_dict = (
await self.router_cache.async_get_cache(key=cost_key) or {}
)
# check local result first
if id not in request_count_dict:
request_count_dict[id] = {}
if precise_minute not in request_count_dict[id]:
request_count_dict[id][precise_minute] = {}
## TPM
request_count_dict[id][precise_minute]["tpm"] = (
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
)
## RPM
request_count_dict[id][precise_minute]["rpm"] = (
request_count_dict[id][precise_minute].get("rpm", 0) + 1
)
await self.router_cache.async_set_cache(
key=cost_key, value=request_count_dict
)
### TESTING ###
if self.test_flag:
self.logged_success += 1
except Exception as e:
traceback.print_exc()
pass
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
"""
Update cost usage on success
"""
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
# ------------
# Setup values
# ------------
"""
{
{model_group}_map: {
id: {
"cost": [..]
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
}
}
}
"""
cost_key = f"{model_group}_map"
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
response_ms: timedelta = end_time - start_time
final_value = response_ms
total_tokens = 0
if isinstance(response_obj, ModelResponse):
completion_tokens = response_obj.usage.completion_tokens
total_tokens = response_obj.usage.total_tokens
final_value = float(response_ms.total_seconds() / completion_tokens)
# ------------
# Update usage
# ------------
request_count_dict = (
await self.router_cache.async_get_cache(key=cost_key) or {}
)
if id not in request_count_dict:
request_count_dict[id] = {}
if precise_minute not in request_count_dict[id]:
request_count_dict[id][precise_minute] = {}
## TPM
request_count_dict[id][precise_minute]["tpm"] = (
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
)
## RPM
request_count_dict[id][precise_minute]["rpm"] = (
request_count_dict[id][precise_minute].get("rpm", 0) + 1
)
await self.router_cache.async_set_cache(
key=cost_key, value=request_count_dict
) # reset map within window
### TESTING ###
if self.test_flag:
self.logged_success += 1
except Exception as e:
traceback.print_exc()
pass
async def async_get_available_deployments(
self,
model_group: str,
healthy_deployments: list,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
request_kwargs: Optional[Dict] = None,
):
"""
Returns a deployment with the lowest cost
"""
cost_key = f"{model_group}_map"
request_count_dict = await self.router_cache.async_get_cache(key=cost_key) or {}
# -----------------------
# Find lowest used model
# ----------------------
lowest_cost = float("inf")
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
deployment = None
if request_count_dict is None: # base case
return
all_deployments = request_count_dict
for d in healthy_deployments:
## if healthy deployment not yet used
if d["model_info"]["id"] not in all_deployments:
all_deployments[d["model_info"]["id"]] = {
precise_minute: {"tpm": 0, "rpm": 0},
}
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
# randomly sample from all_deployments, in case all deployments have latency=0.0
_items = all_deployments.items()
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
potential_deployments = []
_cost_per_deployment = {}
for item, item_map in all_deployments.items():
## get the item from model list
_deployment = None
for m in healthy_deployments:
if item == m["model_info"]["id"]:
_deployment = m
if _deployment is None:
continue # skip to next one
_deployment_tpm = (
_deployment.get("tpm", None)
or _deployment.get("litellm_params", {}).get("tpm", None)
or _deployment.get("model_info", {}).get("tpm", None)
or float("inf")
)
_deployment_rpm = (
_deployment.get("rpm", None)
or _deployment.get("litellm_params", {}).get("rpm", None)
or _deployment.get("model_info", {}).get("rpm", None)
or float("inf")
)
item_litellm_model_name = _deployment.get("litellm_params", {}).get("model")
item_litellm_model_cost_map = litellm.model_cost.get(
item_litellm_model_name, {}
)
# check if user provided input_cost_per_token and output_cost_per_token in litellm_params
item_input_cost = None
item_output_cost = None
if _deployment.get("litellm_params", {}).get("input_cost_per_token", None):
item_input_cost = _deployment.get("litellm_params", {}).get(
"input_cost_per_token"
)
if _deployment.get("litellm_params", {}).get("output_cost_per_token", None):
item_output_cost = _deployment.get("litellm_params", {}).get(
"output_cost_per_token"
)
if item_input_cost is None:
item_input_cost = item_litellm_model_cost_map.get(
"input_cost_per_token", 5.0
)
if item_output_cost is None:
item_output_cost = item_litellm_model_cost_map.get(
"output_cost_per_token", 5.0
)
# if litellm["model"] is not in model_cost map -> use item_cost = $10
item_cost = item_input_cost + item_output_cost
item_rpm = item_map.get(precise_minute, {}).get("rpm", 0)
item_tpm = item_map.get(precise_minute, {}).get("tpm", 0)
verbose_router_logger.debug(
f"item_cost: {item_cost}, item_tpm: {item_tpm}, item_rpm: {item_rpm}, model_id: {_deployment.get('model_info', {}).get('id')}"
)
# -------------- #
# Debugging Logic
# -------------- #
# We use _cost_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
# this helps a user to debug why the router picked a specific deployment #
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
if _deployment_api_base is not None:
_cost_per_deployment[_deployment_api_base] = item_cost
# -------------- #
# End of Debugging Logic
# -------------- #
if (
item_tpm + input_tokens > _deployment_tpm
or item_rpm + 1 > _deployment_rpm
): # if user passed in tpm / rpm in the model_list
continue
else:
potential_deployments.append((_deployment, item_cost))
if len(potential_deployments) == 0:
return None
potential_deployments = sorted(potential_deployments, key=lambda x: x[1])
selected_deployment = potential_deployments[0][0]
return selected_deployment
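Editor's note: to make the intent of this new LowestCostLoggingHandler concrete, here is a minimal, hypothetical usage sketch. The per-deployment input_cost_per_token / output_cost_per_token keys match what the handler reads above; the routing_strategy name "cost-based-routing" and the assumption that Router forwards mock_response are not confirmed by this diff.

```python
# Illustrative sketch only, not part of this commit.
# Two deployments in one model group; the handler should prefer the cheaper one
# while it stays under its tpm/rpm limits.
import asyncio

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-group",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "input_cost_per_token": 0.0000005,   # overrides litellm.model_cost
                "output_cost_per_token": 0.0000015,
            },
        },
        {
            "model_name": "gpt-group",
            "litellm_params": {
                "model": "gpt-4",
                "input_cost_per_token": 0.00003,
                "output_cost_per_token": 0.00006,
            },
        },
    ],
    routing_strategy="cost-based-routing",  # assumed name for this handler
)

async def main():
    resp = await router.acompletion(
        model="gpt-group",
        messages=[{"role": "user", "content": "hi"}],
        mock_response="ok",  # keeps the sketch offline, assuming kwargs are forwarded
    )
    print(resp)

asyncio.run(main())
```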
View file
@@ -1,7 +1,7 @@
#### What this does ####
# picks based on response time (for streaming, this is time to first token)
-from pydantic import BaseModel, Extra, Field, root_validator
-import dotenv, os, requests, random
+from pydantic import BaseModel, Extra, Field, root_validator  # type: ignore
+import dotenv, os, requests, random  # type: ignore
from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta
import random
File diff suppressed because one or more lines are too long

View file
@@ -1,5 +1,6 @@
import pytest
from litellm import acompletion
from litellm import completion
def test_acompletion_params():
@@ -7,17 +8,29 @@ def test_acompletion_params():
from litellm.types.completion import CompletionRequest
acompletion_params_odict = inspect.signature(acompletion).parameters
-acompletion_params = {name: param.annotation for name, param in acompletion_params_odict.items()}
-completion_params = {field_name: field_type for field_name, field_type in CompletionRequest.__annotations__.items()}
-# remove kwargs
-acompletion_params.pop("kwargs", None)
+completion_params_dict = inspect.signature(completion).parameters
+acompletion_params = {
+name: param.annotation for name, param in acompletion_params_odict.items()
+}
+completion_params = {
+name: param.annotation for name, param in completion_params_dict.items()
+}
keys_acompletion = set(acompletion_params.keys())
keys_completion = set(completion_params.keys())
print(keys_acompletion)
print("\n\n\n")
print(keys_completion)
print("diff=", keys_completion - keys_acompletion)
# Assert that the parameters are the same
if keys_acompletion != keys_completion:
-pytest.fail("The parameters of the acompletion function and the CompletionRequest class are not the same.")
+pytest.fail(
+"The parameters of the litellm.acompletion function and litellm.completion are not the same."
+)
# test_acompletion_params()
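Editor's note: the pattern this test uses, comparing two call signatures with inspect.signature so a sync and an async wrapper cannot drift apart, generalizes beyond litellm. A minimal, self-contained sketch with hypothetical functions:

```python
# Illustrative sketch: assert two wrappers expose the same parameter names.
import inspect

def fetch(url: str, timeout: float = 10.0): ...
async def afetch(url: str, timeout: float = 10.0): ...

fetch_params = {n: p.annotation for n, p in inspect.signature(fetch).parameters.items()}
afetch_params = {n: p.annotation for n, p in inspect.signature(afetch).parameters.items()}

# The symmetric difference in the assertion message shows exactly which names drifted.
assert set(fetch_params) == set(afetch_params), set(fetch_params) ^ set(afetch_params)
```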
View file
@@ -1,9 +1,11 @@
import copy
import json
import sys
import os
-import io, asyncio
+import asyncio
import logging
from unittest.mock import MagicMock, patch
logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
@@ -18,6 +20,21 @@ import time
import pytest
@pytest.fixture
def langfuse_client():
import langfuse
langfuse_client = langfuse.Langfuse(
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
)
with patch(
"langfuse.Langfuse", MagicMock(return_value=langfuse_client)
) as mock_langfuse_client:
yield mock_langfuse_client()
def search_logs(log_file_path, num_good_logs=1):
"""
Searches the given log file for logs containing the "/api/public" string.
@@ -129,21 +146,10 @@ def test_langfuse_logging_async():
pytest.fail(f"An exception occurred - {e}")
-async def make_async_calls():
+async def make_async_calls(metadata=None, **completion_kwargs):
tasks = []
for _ in range(5):
-task = asyncio.create_task(
-litellm.acompletion(
-model="azure/chatgpt-v-2",
-messages=[{"role": "user", "content": "This is a test"}],
-max_tokens=5,
-temperature=0.7,
-timeout=5,
-user="langfuse_latency_test_user",
-mock_response="It's simple to use and easy to get started",
-)
-)
-tasks.append(task)
+tasks.append(create_async_task())
# Measure the start time before running the tasks
start_time = asyncio.get_event_loop().time()
@@ -161,9 +167,30 @@ async def make_async_calls():
return total_time
def create_async_task(**completion_kwargs):
"""
Creates an async task for the litellm.acompletion function.
This is just the task, but it is not run here.
To run the task it must be awaited or used in other asyncio coroutine execution functions like asyncio.gather.
Any kwargs passed to this function will be passed to the litellm.acompletion function.
By default a standard set of arguments are used for the litellm.acompletion function.
"""
completion_args = {
"model": "azure/chatgpt-v-2",
"messages": [{"role": "user", "content": "This is a test"}],
"max_tokens": 5,
"temperature": 0.7,
"timeout": 5,
"user": "langfuse_latency_test_user",
"mock_response": "It's simple to use and easy to get started",
}
completion_args.update(completion_kwargs)
return asyncio.create_task(litellm.acompletion(**completion_args))
@pytest.mark.asyncio
@pytest.mark.parametrize("stream", [False, True])
-async def test_langfuse_logging_without_request_response(stream):
+async def test_langfuse_logging_without_request_response(stream, langfuse_client):
try:
import uuid
@@ -171,12 +198,8 @@ async def test_langfuse_logging_without_request_response(stream):
litellm.set_verbose = True
litellm.turn_off_message_logging = True
litellm.success_callback = ["langfuse"]
-response = await litellm.acompletion(
+response = await create_async_task(
model="gpt-3.5-turbo",
-mock_response="It's simple to use and easy to get started",
-messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
-max_tokens=10,
-temperature=0.2,
stream=stream,
metadata={"trace_id": _unique_trace_name},
)
@@ -185,14 +208,8 @@ async def test_langfuse_logging_without_request_response(stream):
async for chunk in response:
print(chunk)
-await asyncio.sleep(3)
-import langfuse
-langfuse_client = langfuse.Langfuse(
-public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
-secret_key=os.environ["LANGFUSE_SECRET_KEY"],
-)
+langfuse_client.flush()
+await asyncio.sleep(2)
# get trace with _unique_trace_name
trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
@@ -211,6 +228,123 @@ async def test_langfuse_logging_without_request_response(stream):
pytest.fail(f"An exception occurred - {e}")
@pytest.mark.asyncio
async def test_langfuse_logging_metadata(langfuse_client):
"""
Test that creates multiple traces, with a varying number of generations and sets various metadata fields
Confirms that no metadata that is standard within Langfuse is duplicated in the respective trace or generation metadata
For trace continuation certain metadata of the trace is overridden with metadata from the last generation based on the update_trace_keys field
Version is set for both the trace and the generation
Release is just set for the trace
Tags is just set for the trace
"""
import uuid
litellm.set_verbose = True
litellm.success_callback = ["langfuse"]
trace_identifiers = {}
expected_filtered_metadata_keys = {
"trace_name",
"trace_id",
"existing_trace_id",
"trace_user_id",
"session_id",
"tags",
"generation_name",
"generation_id",
"prompt",
}
trace_metadata = {
"trace_actual_metadata_key": "trace_actual_metadata_value"
} # Allows for setting the metadata on the trace
run_id = str(uuid.uuid4())
session_id = f"litellm-test-session-{run_id}"
trace_common_metadata = {
"session_id": session_id,
"tags": ["litellm-test-tag1", "litellm-test-tag2"],
"update_trace_keys": [
"output",
"trace_metadata",
], # Overwrite the following fields in the trace with the last generation's output and the trace_user_id
"trace_metadata": trace_metadata,
"gen_metadata_key": "gen_metadata_value", # Metadata key that should not be filtered in the generation
"trace_release": "litellm-test-release",
"version": "litellm-test-version",
}
for trace_num in range(1, 3): # Two traces
metadata = copy.deepcopy(trace_common_metadata)
trace_id = f"litellm-test-trace{trace_num}-{run_id}"
metadata["trace_id"] = trace_id
metadata["trace_name"] = trace_id
trace_identifiers[trace_id] = []
print(f"Trace: {trace_id}")
for generation_num in range(
1, trace_num + 1
): # Each trace has a number of generations equal to its trace number
metadata["trace_user_id"] = f"litellm-test-user{generation_num}-{run_id}"
generation_id = (
f"litellm-test-trace{trace_num}-generation-{generation_num}-{run_id}"
)
metadata["generation_id"] = generation_id
metadata["generation_name"] = generation_id
metadata["trace_metadata"][
"generation_id"
] = generation_id # Update to test if trace_metadata is overwritten by update trace keys
trace_identifiers[trace_id].append(generation_id)
print(f"Generation: {generation_id}")
response = await create_async_task(
model="gpt-3.5-turbo",
mock_response=f"{session_id}:{trace_id}:{generation_id}",
messages=[
{
"role": "user",
"content": f"{session_id}:{trace_id}:{generation_id}",
}
],
max_tokens=100,
temperature=0.2,
metadata=copy.deepcopy(
metadata
), # Every generation needs its own metadata, langfuse is not async/thread safe without it
)
print(response)
metadata["existing_trace_id"] = trace_id
langfuse_client.flush()
await asyncio.sleep(2)
# Tests the metadata filtering and the override of the output to be the last generation
for trace_id, generation_ids in trace_identifiers.items():
trace = langfuse_client.get_trace(id=trace_id)
assert trace.id == trace_id
assert trace.session_id == session_id
assert trace.metadata != trace_metadata
generations = list(
reversed(langfuse_client.get_generations(trace_id=trace_id).data)
)
assert len(generations) == len(generation_ids)
assert (
trace.input == generations[0].input
) # Should be set by the first generation
assert (
trace.output == generations[-1].output
) # Should be overwritten by the last generation according to update_trace_keys
assert (
trace.metadata != generations[-1].metadata
) # Should be overwritten by the last generation according to update_trace_keys
assert trace.metadata["generation_id"] == generations[-1].id
assert set(trace.tags).issuperset(trace_common_metadata["tags"])
print("trace_from_langfuse", trace)
for generation_id, generation in zip(generation_ids, generations):
assert generation.id == generation_id
assert generation.trace_id == trace_id
assert set(generation.metadata.keys()).isdisjoint(
expected_filtered_metadata_keys
)
print("generation_from_langfuse", generation)
@pytest.mark.skip(reason="beta test - checking langfuse output")
def test_langfuse_logging():
try:
@@ -570,6 +704,10 @@ def test_langfuse_existing_trace_id():
assert initial_langfuse_trace_dict == new_langfuse_trace_dict
@pytest.mark.skipif(
condition=not os.environ.get("OPENAI_API_KEY", False),
reason="Authentication missing for openai",
)
def test_langfuse_logging_tool_calling():
litellm.set_verbose = True
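Editor's note: the metadata keys exercised by test_langfuse_logging_metadata above are the ones a caller passes straight through litellm.acompletion. A minimal sketch using only keys that appear in the test; the model, ids, and mock response are placeholders, and Langfuse credentials are assumed to be in the environment:

```python
# Illustrative sketch only, not part of this commit.
# Groups a generation under a named Langfuse trace via the metadata dict.
import asyncio

import litellm

litellm.success_callback = ["langfuse"]  # needs LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY

async def main():
    await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
        mock_response="hi there",
        metadata={
            "trace_id": "my-trace-1",
            "trace_name": "my-trace-1",
            "session_id": "my-session",
            "tags": ["example"],
            "generation_name": "my-trace-1-generation-1",
            "update_trace_keys": ["output"],  # last generation overwrites the trace output
        },
    )

asyncio.run(main())
```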
View file
@@ -1,7 +1,7 @@
# What is this?
## Tests slack alerting on proxy logging object
-import sys
+import sys, json
import os
import io, asyncio
from datetime import datetime, timedelta
@@ -10,14 +10,18 @@ from datetime import datetime, timedelta
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
from litellm.proxy.utils import ProxyLogging
-from litellm.caching import DualCache
+from litellm.caching import DualCache, RedisCache
import litellm
import pytest
import asyncio
from unittest.mock import patch, MagicMock
from litellm.utils import get_api_base
from litellm.caching import DualCache
-from litellm.integrations.slack_alerting import SlackAlerting
+from litellm.integrations.slack_alerting import SlackAlerting, DeploymentMetrics
import unittest.mock
from unittest.mock import AsyncMock
import pytest
from litellm.router import AlertingConfig, Router
@pytest.mark.parametrize(
@@ -61,7 +65,7 @@ async def test_get_api_base():
end_time = datetime.now()
time_difference_float, model, api_base, messages = (
-_pl.slack_alerting_instance._response_taking_too_long_callback(
+_pl.slack_alerting_instance._response_taking_too_long_callback_helper(
kwargs={
"model": model,
"messages": messages,
@@ -98,7 +102,10 @@ def mock_env(monkeypatch):
# Test the __init__ method
def test_init():
slack_alerting = SlackAlerting(
-alerting_threshold=32, alerting=["slack"], alert_types=["llm_exceptions"]
+alerting_threshold=32,
+alerting=["slack"],
+alert_types=["llm_exceptions"],
+internal_usage_cache=DualCache(),
)
assert slack_alerting.alerting_threshold == 32
assert slack_alerting.alerting == ["slack"]
@@ -116,7 +123,7 @@ from datetime import datetime, timedelta
@pytest.fixture
def slack_alerting():
-return SlackAlerting(alerting_threshold=1)
+return SlackAlerting(alerting_threshold=1, internal_usage_cache=DualCache())
# Test for hanging LLM responses
@@ -185,3 +192,170 @@ async def test_send_alert(slack_alerting):
mock_post.return_value.status_code = 200
await slack_alerting.send_alert("Test message", "Low", "budget_alerts")
mock_post.assert_awaited_once()
@pytest.mark.asyncio
async def test_daily_reports_unit_test(slack_alerting):
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
router = litellm.Router(
model_list=[
{
"model_name": "test-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"},
"model_info": {"id": "1234"},
}
]
)
deployment_metrics = DeploymentMetrics(
id="1234",
failed_request=False,
latency_per_output_token=20.3,
updated_at=litellm.utils.get_utc_datetime(),
)
updated_val = await slack_alerting.async_update_daily_reports(
deployment_metrics=deployment_metrics
)
assert updated_val == 1
await slack_alerting.send_daily_reports(router=router)
mock_send_alert.assert_awaited_once()
@pytest.mark.asyncio
async def test_daily_reports_completion(slack_alerting):
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
litellm.callbacks = [slack_alerting]
# on async success
router = litellm.Router(
model_list=[
{
"model_name": "gpt-5",
"litellm_params": {
"model": "gpt-3.5-turbo",
},
}
]
)
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
await asyncio.sleep(3)
response_val = await slack_alerting.send_daily_reports(router=router)
assert response_val == True
mock_send_alert.assert_awaited_once()
# on async failure
router = litellm.Router(
model_list=[
{
"model_name": "gpt-5",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad_key"},
}
]
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except Exception as e:
pass
await asyncio.sleep(3)
response_val = await slack_alerting.send_daily_reports(router=router)
assert response_val == True
mock_send_alert.assert_awaited()
@pytest.mark.asyncio
async def test_daily_reports_redis_cache_scheduler():
redis_cache = RedisCache()
slack_alerting = SlackAlerting(
internal_usage_cache=DualCache(redis_cache=redis_cache)
)
router = litellm.Router(
model_list=[
{
"model_name": "gpt-5",
"litellm_params": {
"model": "gpt-3.5-turbo",
},
}
]
)
with patch.object(
slack_alerting, "send_alert", new=AsyncMock()
) as mock_send_alert, patch.object(
redis_cache, "async_set_cache", new=AsyncMock()
) as mock_redis_set_cache:
# initial call - expect empty
await slack_alerting._run_scheduler_helper(llm_router=router)
try:
json.dumps(mock_redis_set_cache.call_args[0][1])
except Exception as e:
pytest.fail(
"Cache value can't be json dumped - {}".format(
mock_redis_set_cache.call_args[0][1]
)
)
mock_redis_set_cache.assert_awaited_once()
# second call - expect empty
await slack_alerting._run_scheduler_helper(llm_router=router)
@pytest.mark.asyncio
@pytest.mark.skip(reason="Local test. Test if slack alerts are sent.")
async def test_send_llm_exception_to_slack():
from litellm.router import AlertingConfig
# on async success
router = litellm.Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "bad_key",
},
},
{
"model_name": "gpt-5-good",
"litellm_params": {
"model": "gpt-3.5-turbo",
},
},
],
alerting_config=AlertingConfig(
alerting_threshold=0.5, webhook_url=os.getenv("SLACK_WEBHOOK_URL")
),
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except:
pass
await router.acompletion(
model="gpt-5-good",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
await asyncio.sleep(3)
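Editor's note: the recurring change in this test file is that SlackAlerting is now constructed with an internal_usage_cache. A minimal sketch of the updated constructor, mirroring the fixture and test_init above; the threshold value is a placeholder:

```python
# Illustrative sketch only, not part of this commit.
# SlackAlerting now takes an internal_usage_cache (a DualCache), as the updated tests show.
from litellm.caching import DualCache
from litellm.integrations.slack_alerting import SlackAlerting

slack_alerting = SlackAlerting(
    alerting_threshold=300,  # placeholder: seconds before a request counts as hanging
    alerting=["slack"],
    alert_types=["llm_exceptions"],
    internal_usage_cache=DualCache(),
)
```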
View file
@@ -118,6 +118,7 @@ def test_completion_claude():
def test_completion_claude_3_empty_response():
litellm.set_verbose = True
messages = [
{
"role": "system",
@@ -2167,9 +2168,9 @@ def test_completion_replicate_vicuna():
def test_replicate_custom_prompt_dict():
litellm.set_verbose = True
-model_name = "replicate/meta/llama-2-70b-chat"
+model_name = "replicate/meta/llama-2-7b"
litellm.register_prompt_template(
-model="replicate/meta/llama-2-70b-chat",
+model="replicate/meta/llama-2-7b",
initial_prompt_value="You are a good assistant",  # [OPTIONAL]
roles={
"system": {
@@ -2199,6 +2200,7 @@ def test_replicate_custom_prompt_dict():
repetition_penalty=0.1,
num_retries=3,
)
except litellm.APIError as e:
pass
except litellm.APIConnectionError as e:
@@ -3016,6 +3018,21 @@ async def test_acompletion_gemini():
pytest.fail(f"Error occurred: {e}")
# Deepseek tests
def test_completion_deepseek():
litellm.set_verbose = True
model_name = "deepseek/deepseek-chat"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
try:
response = completion(model=model_name, messages=messages)
# Add any assertions here to check the response
print(response)
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# Palm tests
def test_completion_palm():
litellm.set_verbose = True
View file
@@ -231,14 +231,17 @@ def test_cost_bedrock_pricing():
assert cost == predicted_cost
@pytest.mark.skip(reason="AWS disabled our access")
def test_cost_bedrock_pricing_actual_calls():
litellm.set_verbose = True
model = "anthropic.claude-instant-v1"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
-response = litellm.completion(model=model, messages=messages)
-assert response._hidden_params["region_name"] is not None
+response = litellm.completion(
+model=model, messages=messages, mock_response="hello cool one"
+)
+print("response", response)
cost = litellm.completion_cost(
model="bedrock/anthropic.claude-instant-v1",
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
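Editor's note: the new model= argument passed to litellm.completion_cost above pins the pricing entry even when the response itself was mocked. A minimal sketch of the same pattern outside the test; the model and values are placeholders:

```python
# Illustrative sketch only, not part of this commit.
# Compute a cost estimate from a mocked response, pinning the pricing entry via model=.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    mock_response="hello cool one",
)
cost = litellm.completion_cost(
    model="gpt-3.5-turbo",
    completion_response=response,
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"estimated cost: ${cost:.8f}")
```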
View file
@@ -140,6 +140,8 @@ async def test_add_existing_deployment():
deployment_2.to_json(exclude_none=True),
]
)
init_len_list = len(llm_router.model_list)
print(f"llm_router: {llm_router}")
master_key = "sk-1234"
setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
@@ -164,7 +166,7 @@ async def test_add_existing_deployment():
db_models = [db_model]
num_added = pc._add_deployment(db_models=db_models)
-assert num_added == 0
+assert init_len_list == len(llm_router.model_list)
litellm_params = LiteLLM_Params(
Some files were not shown because too many files have changed in this diff.