forked from phoenix/litellm-mirror
added changes from upstream
Merge branch 'main' into fix/error-on-get-user-role
This commit is contained in:
commit
d3a228d03b
142 changed files with 4439 additions and 801 deletions
|
@ -188,7 +188,7 @@ jobs:
|
|||
command: |
|
||||
docker run -d \
|
||||
-p 4000:4000 \
|
||||
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
|
||||
-e DATABASE_URL=$PROXY_DATABASE_URL \
|
||||
-e AZURE_API_KEY=$AZURE_API_KEY \
|
||||
-e REDIS_HOST=$REDIS_HOST \
|
||||
-e REDIS_PASSWORD=$REDIS_PASSWORD \
|
||||
|
@ -223,7 +223,7 @@ jobs:
|
|||
background: true
|
||||
- run:
|
||||
name: Wait for app to be ready
|
||||
command: dockerize -wait http://localhost:4000 -timeout 1m
|
||||
command: dockerize -wait http://localhost:4000 -timeout 5m
|
||||
- run:
|
||||
name: Run tests
|
||||
command: |
|
||||
|
|
51
.devcontainer/devcontainer.json
Normal file
51
.devcontainer/devcontainer.json
Normal file
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"name": "Python 3.11",
|
||||
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
||||
"image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
|
||||
// https://github.com/devcontainers/images/tree/main/src/python
|
||||
// https://mcr.microsoft.com/en-us/product/devcontainers/python/tags
|
||||
|
||||
// "build": {
|
||||
// "dockerfile": "Dockerfile",
|
||||
// "context": ".."
|
||||
// },
|
||||
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {},
|
||||
|
||||
// Configure tool-specific properties.
|
||||
"customizations": {
|
||||
// Configure properties specific to VS Code.
|
||||
"vscode": {
|
||||
"settings": {},
|
||||
"extensions": [
|
||||
"ms-python.python",
|
||||
"ms-python.vscode-pylance",
|
||||
"GitHub.copilot",
|
||||
"GitHub.copilot-chat"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
"forwardPorts": [4000],
|
||||
|
||||
"containerEnv": {
|
||||
"LITELLM_LOG": "DEBUG"
|
||||
},
|
||||
|
||||
// Use 'portsAttributes' to set default properties for specific forwarded ports.
|
||||
// More info: https://containers.dev/implementors/json_reference/#port-attributes
|
||||
"portsAttributes": {
|
||||
"4000": {
|
||||
"label": "LiteLLM Server",
|
||||
"onAutoForward": "notify"
|
||||
}
|
||||
},
|
||||
|
||||
// More info: https://aka.ms/dev-containers-non-root.
|
||||
// "remoteUser": "litellm",
|
||||
|
||||
// Use 'postCreateCommand' to run commands after the container is created.
|
||||
"postCreateCommand": "pipx install poetry && poetry install -E extra_proxy -E proxy"
|
||||
}
|
19
.github/workflows/interpret_load_test.py
vendored
19
.github/workflows/interpret_load_test.py
vendored
|
@ -64,6 +64,11 @@ if __name__ == "__main__":
|
|||
) # Replace with your repository's username and name
|
||||
latest_release = repo.get_latest_release()
|
||||
print("got latest release: ", latest_release)
|
||||
print(latest_release.title)
|
||||
print(latest_release.tag_name)
|
||||
|
||||
release_version = latest_release.title
|
||||
|
||||
print("latest release body: ", latest_release.body)
|
||||
print("markdown table: ", markdown_table)
|
||||
|
||||
|
@ -74,8 +79,22 @@ if __name__ == "__main__":
|
|||
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
|
||||
existing_release_body = latest_release.body[:start_index]
|
||||
|
||||
docker_run_command = f"""
|
||||
\n\n
|
||||
## Docker Run LiteLLM Proxy
|
||||
|
||||
```
|
||||
docker run \\
|
||||
-e STORE_MODEL_IN_DB=True \\
|
||||
-p 4000:4000 \\
|
||||
ghcr.io/berriai/litellm:main-{release_version}
|
||||
```
|
||||
"""
|
||||
print("docker run command: ", docker_run_command)
|
||||
|
||||
new_release_body = (
|
||||
existing_release_body
|
||||
+ docker_run_command
|
||||
+ "\n\n"
|
||||
+ "### Don't want to maintain your internal proxy? get in touch 🎉"
|
||||
+ "\nHosted Proxy Alpha: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
|
||||
|
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,5 +1,6 @@
|
|||
.venv
|
||||
.env
|
||||
litellm/proxy/myenv/*
|
||||
litellm_uuid.txt
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
@ -52,3 +53,6 @@ litellm/proxy/_new_secret_config.yaml
|
|||
litellm/proxy/_new_secret_config.yaml
|
||||
litellm/proxy/_super_secret_config.yaml
|
||||
litellm/proxy/_super_secret_config.yaml
|
||||
litellm/proxy/myenv/bin/activate
|
||||
litellm/proxy/myenv/bin/Activate.ps1
|
||||
myenv/*
|
|
@ -16,11 +16,11 @@ repos:
|
|||
name: Check if files match
|
||||
entry: python3 ci_cd/check_files_match.py
|
||||
language: system
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: mypy
|
||||
name: mypy
|
||||
entry: python3 -m mypy --ignore-missing-imports
|
||||
language: system
|
||||
types: [python]
|
||||
files: ^litellm/
|
||||
# - repo: local
|
||||
# hooks:
|
||||
# - id: mypy
|
||||
# name: mypy
|
||||
# entry: python3 -m mypy --ignore-missing-imports
|
||||
# language: system
|
||||
# types: [python]
|
||||
# files: ^litellm/
|
|
@ -226,6 +226,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
|
|||
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅
|
||||
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
|
||||
|
|
BIN
deploy/azure_resource_manager/azure_marketplace.zip
Normal file
BIN
deploy/azure_resource_manager/azure_marketplace.zip
Normal file
Binary file not shown.
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#",
|
||||
"handler": "Microsoft.Azure.CreateUIDef",
|
||||
"version": "0.1.2-preview",
|
||||
"parameters": {
|
||||
"config": {
|
||||
"isWizard": false,
|
||||
"basics": { }
|
||||
},
|
||||
"basics": [ ],
|
||||
"steps": [ ],
|
||||
"outputs": { },
|
||||
"resourceTypes": [ ]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
{
|
||||
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
||||
"contentVersion": "1.0.0.0",
|
||||
"parameters": {
|
||||
"imageName": {
|
||||
"type": "string",
|
||||
"defaultValue": "ghcr.io/berriai/litellm:main-latest"
|
||||
},
|
||||
"containerName": {
|
||||
"type": "string",
|
||||
"defaultValue": "litellm-container"
|
||||
},
|
||||
"dnsLabelName": {
|
||||
"type": "string",
|
||||
"defaultValue": "litellm"
|
||||
},
|
||||
"portNumber": {
|
||||
"type": "int",
|
||||
"defaultValue": 4000
|
||||
}
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.ContainerInstance/containerGroups",
|
||||
"apiVersion": "2021-03-01",
|
||||
"name": "[parameters('containerName')]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"properties": {
|
||||
"containers": [
|
||||
{
|
||||
"name": "[parameters('containerName')]",
|
||||
"properties": {
|
||||
"image": "[parameters('imageName')]",
|
||||
"resources": {
|
||||
"requests": {
|
||||
"cpu": 1,
|
||||
"memoryInGB": 2
|
||||
}
|
||||
},
|
||||
"ports": [
|
||||
{
|
||||
"port": "[parameters('portNumber')]"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"osType": "Linux",
|
||||
"restartPolicy": "Always",
|
||||
"ipAddress": {
|
||||
"type": "Public",
|
||||
"ports": [
|
||||
{
|
||||
"protocol": "tcp",
|
||||
"port": "[parameters('portNumber')]"
|
||||
}
|
||||
],
|
||||
"dnsNameLabel": "[parameters('dnsLabelName')]"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
42
deploy/azure_resource_manager/main.bicep
Normal file
42
deploy/azure_resource_manager/main.bicep
Normal file
|
@ -0,0 +1,42 @@
|
|||
param imageName string = 'ghcr.io/berriai/litellm:main-latest'
|
||||
param containerName string = 'litellm-container'
|
||||
param dnsLabelName string = 'litellm'
|
||||
param portNumber int = 4000
|
||||
|
||||
resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = {
|
||||
name: containerName
|
||||
location: resourceGroup().location
|
||||
properties: {
|
||||
containers: [
|
||||
{
|
||||
name: containerName
|
||||
properties: {
|
||||
image: imageName
|
||||
resources: {
|
||||
requests: {
|
||||
cpu: 1
|
||||
memoryInGB: 2
|
||||
}
|
||||
}
|
||||
ports: [
|
||||
{
|
||||
port: portNumber
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
osType: 'Linux'
|
||||
restartPolicy: 'Always'
|
||||
ipAddress: {
|
||||
type: 'Public'
|
||||
ports: [
|
||||
{
|
||||
protocol: 'tcp'
|
||||
port: portNumber
|
||||
}
|
||||
]
|
||||
dnsNameLabel: dnsLabelName
|
||||
}
|
||||
}
|
||||
}
|
|
@ -83,6 +83,7 @@ def completion(
|
|||
top_p: Optional[float] = None,
|
||||
n: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
stream_options: Optional[dict] = None,
|
||||
stop=None,
|
||||
max_tokens: Optional[int] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
|
@ -139,6 +140,10 @@ def completion(
|
|||
|
||||
- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
|
||||
|
||||
- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true`
|
||||
|
||||
- `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
|
||||
|
||||
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
|
||||
|
||||
- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.
|
||||
|
|
|
@ -47,3 +47,12 @@ Pricing is based on usage. We can figure out a price that works for your team, o
|
|||
<Image img={require('../img/litellm_hosted_ui_router.png')} />
|
||||
|
||||
#### [**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
|
||||
|
||||
## Feature List
|
||||
|
||||
- Easy way to add/remove models
|
||||
- 100% uptime even when models are added/removed
|
||||
- custom callback webhooks
|
||||
- your domain name with HTTPS
|
||||
- Ability to create/delete User API keys
|
||||
- Reasonable set monthly cost
|
|
@ -14,14 +14,14 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="gpt-3.5-turbo")
|
||||
|
@ -30,7 +30,7 @@ messages = [
|
|||
content="what model are you"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -39,14 +39,14 @@ chat(messages)
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['ANTHROPIC_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="claude-2", temperature=0.3)
|
||||
|
@ -55,7 +55,7 @@ messages = [
|
|||
content="what model are you"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -64,14 +64,14 @@ chat(messages)
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['REPLICATE_API_TOKEN'] = ""
|
||||
chat = ChatLiteLLM(model="replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1")
|
||||
|
@ -80,7 +80,7 @@ messages = [
|
|||
content="what model are you?"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -89,14 +89,14 @@ chat(messages)
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['COHERE_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="command-nightly")
|
||||
|
@ -105,32 +105,9 @@ messages = [
|
|||
content="what model are you?"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="palm" label="PaLM - Google">
|
||||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['PALM_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="palm/chat-bison")
|
||||
messages = [
|
||||
HumanMessage(
|
||||
content="what model are you?"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
|
|
@ -94,9 +94,10 @@ print(response)
|
|||
|
||||
```
|
||||
|
||||
### Set Custom Trace ID, Trace User ID and Tags
|
||||
### Set Custom Trace ID, Trace User ID, Trace Metadata, Trace Version, Trace Release and Tags
|
||||
|
||||
Pass `trace_id`, `trace_user_id`, `trace_metadata`, `trace_version`, `trace_release`, `tags` in `metadata`
|
||||
|
||||
Pass `trace_id`, `trace_user_id` in `metadata`
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
@ -121,12 +122,20 @@ response = completion(
|
|||
metadata={
|
||||
"generation_name": "ishaan-test-generation", # set langfuse Generation Name
|
||||
"generation_id": "gen-id22", # set langfuse Generation ID
|
||||
"version": "test-generation-version" # set langfuse Generation Version
|
||||
"trace_user_id": "user-id2", # set langfuse Trace User ID
|
||||
"session_id": "session-1", # set langfuse Session ID
|
||||
"tags": ["tag1", "tag2"] # set langfuse Tags
|
||||
"tags": ["tag1", "tag2"], # set langfuse Tags
|
||||
"trace_id": "trace-id22", # set langfuse Trace ID
|
||||
"trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
|
||||
"trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
|
||||
"trace_release": "test-trace-release", # set langfuse Trace Release
|
||||
### OR ###
|
||||
"existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
|
||||
"existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
|
||||
### OR enforce that certain fields are trace overwritten in the trace during the continuation ###
|
||||
"existing_trace_id": "trace-id22",
|
||||
"trace_metadata": {"key": "updated_trace_value"}, # The new value to use for the langfuse Trace Metadata
|
||||
"update_trace_keys": ["input", "output", "trace_metadata"], # Updates the trace input & output to be this generations input & output also updates the Trace Metadata to match the passed in value
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -134,6 +143,38 @@ print(response)
|
|||
|
||||
```
|
||||
|
||||
### Trace & Generation Parameters
|
||||
|
||||
#### Trace Specific Parameters
|
||||
|
||||
* `trace_id` - Identifier for the trace, must use `existing_trace_id` instead or in conjunction with `trace_id` if this is an existing trace, auto-generated by default
|
||||
* `trace_name` - Name of the trace, auto-generated by default
|
||||
* `session_id` - Session identifier for the trace, defaults to `None`
|
||||
* `trace_version` - Version for the trace, defaults to value for `version`
|
||||
* `trace_release` - Release for the trace, defaults to `None`
|
||||
* `trace_metadata` - Metadata for the trace, defaults to `None`
|
||||
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
|
||||
* `tags` - Tags for the trace, defeaults to `None`
|
||||
|
||||
##### Updatable Parameters on Continuation
|
||||
|
||||
The following parameters can be updated on a continuation of a trace by passing in the following values into the `update_trace_keys` in the metadata of the completion.
|
||||
|
||||
* `input` - Will set the traces input to be the input of this latest generation
|
||||
* `output` - Will set the traces output to be the output of this generation
|
||||
* `trace_version` - Will set the trace version to be the provided value (To use the latest generations version instead, use `version`)
|
||||
* `trace_release` - Will set the trace release to be the provided value
|
||||
* `trace_metadata` - Will set the trace metadata to the provided value
|
||||
* `trace_user_id` - Will set the trace user id to the provided value
|
||||
|
||||
#### Generation Specific Parameters
|
||||
|
||||
* `generation_id` - Identifier for the generation, auto-generated by default
|
||||
* `generation_name` - Identifier for the generation, auto-generated by default
|
||||
* `prompt` - Langfuse prompt object used for the generation, defaults to None
|
||||
|
||||
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
|
||||
|
||||
### Use LangChain ChatLiteLLM + Langfuse
|
||||
Pass `trace_user_id`, `session_id` in model_kwargs
|
||||
```python
|
||||
|
|
54
docs/my-website/docs/providers/deepseek.md
Normal file
54
docs/my-website/docs/providers/deepseek.md
Normal file
|
@ -0,0 +1,54 @@
|
|||
# Deepseek
|
||||
https://deepseek.com/
|
||||
|
||||
**We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests**
|
||||
|
||||
## API Key
|
||||
```python
|
||||
# env variable
|
||||
os.environ['DEEPSEEK_API_KEY']
|
||||
```
|
||||
|
||||
## Sample Usage
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['DEEPSEEK_API_KEY'] = ""
|
||||
response = completion(
|
||||
model="deepseek/deepseek-chat",
|
||||
messages=[
|
||||
{"role": "user", "content": "hello from litellm"}
|
||||
],
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
## Sample Usage - Streaming
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['DEEPSEEK_API_KEY'] = ""
|
||||
response = completion(
|
||||
model="deepseek/deepseek-chat",
|
||||
messages=[
|
||||
{"role": "user", "content": "hello from litellm"}
|
||||
],
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
|
||||
## Supported Models - ALL Deepseek Models Supported!
|
||||
We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests
|
||||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
|
||||
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
|
||||
|
||||
|
|
@ -44,14 +44,14 @@ for chunk in response:
|
|||
## Supported Models
|
||||
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/c1b25538277206b9f00de5254d80d6a83bb19a29/model_prices_and_context_window.json).
|
||||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| mistral-tiny | `completion(model="mistral/mistral-tiny", messages)` |
|
||||
| mistral-small | `completion(model="mistral/mistral-small", messages)` |
|
||||
| mistral-medium | `completion(model="mistral/mistral-medium", messages)` |
|
||||
| mistral-large-latest | `completion(model="mistral/mistral-large-latest", messages)` |
|
||||
| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------|--------------------------------------------------------------|
|
||||
| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
|
||||
| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
|
||||
| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` |
|
||||
| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
|
||||
| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
|
||||
| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |
|
||||
|
||||
## Function Calling
|
||||
|
||||
|
@ -116,6 +116,6 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported
|
|||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| mistral-embed | `embedding(model="mistral/mistral-embed", input)` |
|
||||
| Mistral Embeddings | `embedding(model="mistral/mistral-embed", input)` |
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ This is a new feature, and subject to changes based on feedback.
|
|||
### Step 1. Setup Proxy
|
||||
|
||||
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
|
||||
- `JWT_AUDIENCE`: This is the audience used for decoding the JWT. If not set, the decode step will not verify the audience.
|
||||
|
||||
```bash
|
||||
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
|
||||
|
|
|
@ -12,8 +12,8 @@ Requirements:
|
|||
|
||||
You can set budgets at 3 levels:
|
||||
- For the proxy
|
||||
- For a user
|
||||
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
|
||||
- For an internal user
|
||||
- For an end-user
|
||||
- For a key
|
||||
- For a key (model specific budgets)
|
||||
|
||||
|
@ -58,7 +58,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user" label="For User">
|
||||
<TabItem value="per-user" label="For Internal User">
|
||||
|
||||
Apply a budget across multiple keys.
|
||||
|
||||
|
@ -165,12 +165,12 @@ curl --location 'http://localhost:4000/team/new' \
|
|||
}
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
|
||||
<TabItem value="per-user-chat" label="For End User">
|
||||
|
||||
Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**
|
||||
|
||||
**Step 1. Modify config.yaml**
|
||||
Define `litellm.max_user_budget`
|
||||
Define `litellm.max_end_user_budget`
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
|
@ -328,7 +328,7 @@ You can set:
|
|||
- max parallel requests
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="per-user" label="Per User">
|
||||
<TabItem value="per-user" label="Per Internal User">
|
||||
|
||||
Use `/user/new`, to persist rate limits across multiple keys.
|
||||
|
||||
|
@ -408,7 +408,7 @@ curl --location 'http://localhost:4000/user/new' \
|
|||
```
|
||||
|
||||
|
||||
## Create new keys for existing user
|
||||
## Create new keys for existing internal user
|
||||
|
||||
Just include user_id in the `/key/generate` request.
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@ print(response)
|
|||
- `router.aimage_generation()` - async image generation calls
|
||||
|
||||
## Advanced - Routing Strategies
|
||||
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
|
||||
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
|
||||
|
||||
Router provides 4 strategies for routing your calls across multiple deployments:
|
||||
|
||||
|
@ -467,6 +467,101 @@ async def router_acompletion():
|
|||
asyncio.run(router_acompletion())
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
|
||||
|
||||
Picks a deployment based on the lowest cost
|
||||
|
||||
How this works:
|
||||
- Get all healthy deployments
|
||||
- Select all deployments that are under their provided `rpm/tpm` limits
|
||||
- For each deployment check if `litellm_param["model"]` exists in [`litellm_model_cost_map`](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||
- if deployment does not exist in `litellm_model_cost_map` -> use deployment_cost= `$1`
|
||||
- Select deployment with lowest cost
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
import asyncio
|
||||
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {"model": "gpt-4"},
|
||||
"model_info": {"id": "openai-gpt-4"},
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {"model": "groq/llama3-8b-8192"},
|
||||
"model_info": {"id": "groq-llama"},
|
||||
},
|
||||
]
|
||||
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
|
||||
print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
|
||||
```
|
||||
|
||||
|
||||
#### Using Custom Input/Output pricing
|
||||
|
||||
Set `litellm_params["input_cost_per_token"]` and `litellm_params["output_cost_per_token"]` for using custom pricing when routing
|
||||
|
||||
```python
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"input_cost_per_token": 0.00003,
|
||||
"output_cost_per_token": 0.00003,
|
||||
},
|
||||
"model_info": {"id": "chatgpt-v-experimental"},
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-1",
|
||||
"input_cost_per_token": 0.000000001,
|
||||
"output_cost_per_token": 0.00000001,
|
||||
},
|
||||
"model_info": {"id": "chatgpt-v-1"},
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-5",
|
||||
"input_cost_per_token": 10,
|
||||
"output_cost_per_token": 12,
|
||||
},
|
||||
"model_info": {"id": "chatgpt-v-5"},
|
||||
},
|
||||
]
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
|
||||
print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
@ -991,6 +1086,46 @@ async def test_acompletion_caching_on_router_caching_groups():
|
|||
asyncio.run(test_acompletion_caching_on_router_caching_groups())
|
||||
```
|
||||
|
||||
## Alerting 🚨
|
||||
|
||||
Send alerts to slack / your webhook url for the following events
|
||||
- LLM API Exceptions
|
||||
- Slow LLM Responses
|
||||
|
||||
Get a slack webhook url from https://api.slack.com/messaging/webhooks
|
||||
|
||||
#### Usage
|
||||
Initialize an `AlertingConfig` and pass it to `litellm.Router`. The following code will trigger an alert because `api_key=bad-key` which is invalid
|
||||
|
||||
```python
|
||||
from litellm.router import AlertingConfig
|
||||
import litellm
|
||||
import os
|
||||
|
||||
router = litellm.Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"api_key": "bad_key",
|
||||
},
|
||||
}
|
||||
],
|
||||
alerting_config= AlertingConfig(
|
||||
alerting_threshold=10, # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
|
||||
webhook_url= os.getenv("SLACK_WEBHOOK_URL") # webhook you want to send alerts to
|
||||
),
|
||||
)
|
||||
try:
|
||||
await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
except:
|
||||
pass
|
||||
```
|
||||
|
||||
## Track cost for Azure Deployments
|
||||
|
||||
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
|
||||
|
@ -1159,6 +1294,7 @@ def __init__(
|
|||
"least-busy",
|
||||
"usage-based-routing",
|
||||
"latency-based-routing",
|
||||
"cost-based-routing",
|
||||
] = "simple-shuffle",
|
||||
|
||||
## DEBUGGING ##
|
||||
|
|
|
@ -134,6 +134,7 @@ const sidebars = {
|
|||
"providers/ollama",
|
||||
"providers/perplexity",
|
||||
"providers/groq",
|
||||
"providers/deepseek",
|
||||
"providers/fireworks_ai",
|
||||
"providers/vllm",
|
||||
"providers/xinference",
|
||||
|
|
|
@ -291,7 +291,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
|
|||
|
||||
|
||||
def _forecast_daily_cost(data: list):
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
if len(data) == 0:
|
||||
|
|
|
@ -361,6 +361,7 @@ openai_compatible_endpoints: List = [
|
|||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"api.groq.com/openai/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
]
|
||||
|
||||
|
@ -369,6 +370,7 @@ openai_compatible_providers: List = [
|
|||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
|
@ -523,6 +525,7 @@ provider_list: List = [
|
|||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"deepseek",
|
||||
"maritalk",
|
||||
"voyage",
|
||||
"cloudflare",
|
||||
|
|
|
@ -10,8 +10,8 @@
|
|||
# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
|
||||
import os
|
||||
import inspect
|
||||
import redis, litellm
|
||||
import redis.asyncio as async_redis
|
||||
import redis, litellm # type: ignore
|
||||
import redis.asyncio as async_redis # type: ignore
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
import os, json, time
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse
|
||||
import requests, threading
|
||||
import requests, threading # type: ignore
|
||||
from typing import Optional, Union, Literal
|
||||
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ class InMemoryCache(BaseCache):
|
|||
return_val.append(val)
|
||||
return return_val
|
||||
|
||||
async def async_increment(self, key, value: int, **kwargs) -> int:
|
||||
async def async_increment(self, key, value: float, **kwargs) -> float:
|
||||
# get the value
|
||||
init_value = await self.async_get_cache(key=key) or 0
|
||||
value = init_value + value
|
||||
|
@ -423,12 +423,12 @@ class RedisCache(BaseCache):
|
|||
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
|
||||
await self.flush_cache_buffer() # logging done in here
|
||||
|
||||
async def async_increment(self, key, value: int, **kwargs) -> int:
|
||||
async def async_increment(self, key, value: float, **kwargs) -> float:
|
||||
_redis_client = self.init_async_client()
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
result = await redis_client.incr(name=key, amount=value)
|
||||
result = await redis_client.incrbyfloat(name=key, amount=value)
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
|
@ -1382,18 +1382,41 @@ class DualCache(BaseCache):
|
|||
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_batch_set_cache(
|
||||
self, cache_list: list, local_only: bool = False, **kwargs
|
||||
):
|
||||
"""
|
||||
Batch write values to the cache
|
||||
"""
|
||||
print_verbose(
|
||||
f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
|
||||
)
|
||||
try:
|
||||
if self.in_memory_cache is not None:
|
||||
await self.in_memory_cache.async_set_cache_pipeline(
|
||||
cache_list=cache_list, **kwargs
|
||||
)
|
||||
|
||||
if self.redis_cache is not None and local_only == False:
|
||||
await self.redis_cache.async_set_cache_pipeline(
|
||||
cache_list=cache_list, ttl=kwargs.get("ttl", None)
|
||||
)
|
||||
except Exception as e:
|
||||
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_increment_cache(
|
||||
self, key, value: int, local_only: bool = False, **kwargs
|
||||
) -> int:
|
||||
self, key, value: float, local_only: bool = False, **kwargs
|
||||
) -> float:
|
||||
"""
|
||||
Key - the key in cache
|
||||
|
||||
Value - int - the value you want to increment by
|
||||
Value - float - the value you want to increment by
|
||||
|
||||
Returns - int - the incremented value
|
||||
Returns - float - the incremented value
|
||||
"""
|
||||
try:
|
||||
result: int = value
|
||||
result: float = value
|
||||
if self.in_memory_cache is not None:
|
||||
result = await self.in_memory_cache.async_increment(
|
||||
key, value, **kwargs
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#### What this does ####
|
||||
# On success + failure, log events to aispend.io
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -4,18 +4,30 @@ import datetime
|
|||
class AthinaLogger:
|
||||
def __init__(self):
|
||||
import os
|
||||
|
||||
self.athina_api_key = os.getenv("ATHINA_API_KEY")
|
||||
self.headers = {
|
||||
"athina-api-key": self.athina_api_key,
|
||||
"Content-Type": "application/json"
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
|
||||
self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]
|
||||
self.additional_keys = [
|
||||
"environment",
|
||||
"prompt_slug",
|
||||
"customer_id",
|
||||
"customer_user_id",
|
||||
"session_id",
|
||||
"external_reference_id",
|
||||
"context",
|
||||
"expected_response",
|
||||
"user_query",
|
||||
]
|
||||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import json
|
||||
import traceback
|
||||
|
||||
try:
|
||||
response_json = response_obj.model_dump() if response_obj else {}
|
||||
data = {
|
||||
|
@ -23,19 +35,30 @@ class AthinaLogger:
|
|||
"request": kwargs,
|
||||
"response": response_json,
|
||||
"prompt_tokens": response_json.get("usage", {}).get("prompt_tokens"),
|
||||
"completion_tokens": response_json.get("usage", {}).get("completion_tokens"),
|
||||
"completion_tokens": response_json.get("usage", {}).get(
|
||||
"completion_tokens"
|
||||
),
|
||||
"total_tokens": response_json.get("usage", {}).get("total_tokens"),
|
||||
}
|
||||
|
||||
if type(end_time) == datetime.datetime and type(start_time) == datetime.datetime:
|
||||
data["response_time"] = int((end_time - start_time).total_seconds() * 1000)
|
||||
if (
|
||||
type(end_time) == datetime.datetime
|
||||
and type(start_time) == datetime.datetime
|
||||
):
|
||||
data["response_time"] = int(
|
||||
(end_time - start_time).total_seconds() * 1000
|
||||
)
|
||||
|
||||
if "messages" in kwargs:
|
||||
data["prompt"] = kwargs.get("messages", None)
|
||||
|
||||
# Directly add tools or functions if present
|
||||
optional_params = kwargs.get("optional_params", {})
|
||||
data.update((k, v) for k, v in optional_params.items() if k in ["tools", "functions"])
|
||||
data.update(
|
||||
(k, v)
|
||||
for k, v in optional_params.items()
|
||||
if k in ["tools", "functions"]
|
||||
)
|
||||
|
||||
# Add additional metadata keys
|
||||
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
|
||||
|
@ -44,11 +67,19 @@ class AthinaLogger:
|
|||
if key in metadata:
|
||||
data[key] = metadata[key]
|
||||
|
||||
response = requests.post(self.athina_logging_url, headers=self.headers, data=json.dumps(data, default=str))
|
||||
response = requests.post(
|
||||
self.athina_logging_url,
|
||||
headers=self.headers,
|
||||
data=json.dumps(data, default=str),
|
||||
)
|
||||
if response.status_code != 200:
|
||||
print_verbose(f"Athina Logger Error - {response.text}, {response.status_code}")
|
||||
print_verbose(
|
||||
f"Athina Logger Error - {response.text}, {response.status_code}"
|
||||
)
|
||||
else:
|
||||
print_verbose(f"Athina Logger Succeeded - {response.text}")
|
||||
except Exception as e:
|
||||
print_verbose(f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}")
|
||||
print_verbose(
|
||||
f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}"
|
||||
)
|
||||
pass
|
|
@ -1,7 +1,7 @@
|
|||
#### What this does ####
|
||||
# On success + failure, log events to aispend.io
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Promptlayer
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
from litellm.caching import DualCache
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Promptlayer
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
from litellm.caching import DualCache
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import requests
|
||||
import requests # type: ignore
|
||||
import json
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
class GreenscaleLogger:
|
||||
def __init__(self):
|
||||
import os
|
||||
|
||||
self.greenscale_api_key = os.getenv("GREENSCALE_API_KEY")
|
||||
self.headers = {
|
||||
"api-key": self.greenscale_api_key,
|
||||
"Content-Type": "application/json"
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
self.greenscale_logging_url = os.getenv("GREENSCALE_ENDPOINT")
|
||||
|
||||
|
@ -19,13 +21,18 @@ class GreenscaleLogger:
|
|||
data = {
|
||||
"modelId": kwargs.get("model"),
|
||||
"inputTokenCount": response_json.get("usage", {}).get("prompt_tokens"),
|
||||
"outputTokenCount": response_json.get("usage", {}).get("completion_tokens"),
|
||||
"outputTokenCount": response_json.get("usage", {}).get(
|
||||
"completion_tokens"
|
||||
),
|
||||
}
|
||||
data["timestamp"] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
data["timestamp"] = datetime.now(timezone.utc).strftime(
|
||||
"%Y-%m-%dT%H:%M:%SZ"
|
||||
)
|
||||
|
||||
if type(end_time) == datetime and type(start_time) == datetime:
|
||||
data["invocationLatency"] = int((end_time - start_time).total_seconds() * 1000)
|
||||
|
||||
data["invocationLatency"] = int(
|
||||
(end_time - start_time).total_seconds() * 1000
|
||||
)
|
||||
|
||||
# Add additional metadata keys to tags
|
||||
tags = []
|
||||
|
@ -37,15 +44,25 @@ class GreenscaleLogger:
|
|||
elif key == "greenscale_application":
|
||||
data["application"] = value
|
||||
else:
|
||||
tags.append({"key": key.replace("greenscale_", ""), "value": str(value)})
|
||||
tags.append(
|
||||
{"key": key.replace("greenscale_", ""), "value": str(value)}
|
||||
)
|
||||
|
||||
data["tags"] = tags
|
||||
|
||||
response = requests.post(self.greenscale_logging_url, headers=self.headers, data=json.dumps(data, default=str))
|
||||
response = requests.post(
|
||||
self.greenscale_logging_url,
|
||||
headers=self.headers,
|
||||
data=json.dumps(data, default=str),
|
||||
)
|
||||
if response.status_code != 200:
|
||||
print_verbose(f"Greenscale Logger Error - {response.text}, {response.status_code}")
|
||||
print_verbose(
|
||||
f"Greenscale Logger Error - {response.text}, {response.status_code}"
|
||||
)
|
||||
else:
|
||||
print_verbose(f"Greenscale Logger Succeeded - {response.text}")
|
||||
except Exception as e:
|
||||
print_verbose(f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}")
|
||||
print_verbose(
|
||||
f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}"
|
||||
)
|
||||
pass
|
|
@ -1,7 +1,7 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Helicone
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import litellm
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
|
|
|
@ -262,6 +262,7 @@ class LangFuseLogger:
|
|||
|
||||
try:
|
||||
tags = []
|
||||
metadata = copy.deepcopy(metadata) # Avoid modifying the original metadata
|
||||
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
|
||||
supports_prompt = Version(langfuse.version.__version__) >= Version("2.7.3")
|
||||
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
|
||||
|
@ -272,36 +273,9 @@ class LangFuseLogger:
|
|||
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
|
||||
|
||||
if supports_tags:
|
||||
metadata_tags = metadata.get("tags", [])
|
||||
metadata_tags = metadata.pop("tags", [])
|
||||
tags = metadata_tags
|
||||
|
||||
trace_name = metadata.get("trace_name", None)
|
||||
trace_id = metadata.get("trace_id", None)
|
||||
existing_trace_id = metadata.get("existing_trace_id", None)
|
||||
if trace_name is None and existing_trace_id is None:
|
||||
# just log `litellm-{call_type}` as the trace name
|
||||
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
|
||||
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
if existing_trace_id is not None:
|
||||
trace_params = {"id": existing_trace_id}
|
||||
else: # don't overwrite an existing trace
|
||||
trace_params = {
|
||||
"name": trace_name,
|
||||
"input": input,
|
||||
"user_id": metadata.get("trace_user_id", user_id),
|
||||
"id": trace_id,
|
||||
"session_id": metadata.get("session_id", None),
|
||||
}
|
||||
|
||||
if level == "ERROR":
|
||||
trace_params["status_message"] = output
|
||||
else:
|
||||
trace_params["output"] = output
|
||||
|
||||
cost = kwargs.get("response_cost", None)
|
||||
print_verbose(f"trace: {cost}")
|
||||
|
||||
# Clean Metadata before logging - never log raw metadata
|
||||
# the raw metadata can contain circular references which leads to infinite recursion
|
||||
# we clean out all extra litellm metadata params before logging
|
||||
|
@ -328,6 +302,66 @@ class LangFuseLogger:
|
|||
else:
|
||||
clean_metadata[key] = value
|
||||
|
||||
session_id = clean_metadata.pop("session_id", None)
|
||||
trace_name = clean_metadata.pop("trace_name", None)
|
||||
trace_id = clean_metadata.pop("trace_id", None)
|
||||
existing_trace_id = clean_metadata.pop("existing_trace_id", None)
|
||||
update_trace_keys = clean_metadata.pop("update_trace_keys", [])
|
||||
|
||||
if trace_name is None and existing_trace_id is None:
|
||||
# just log `litellm-{call_type}` as the trace name
|
||||
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
|
||||
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
if existing_trace_id is not None:
|
||||
trace_params = {"id": existing_trace_id}
|
||||
|
||||
# Update the following keys for this trace
|
||||
for metadata_param_key in update_trace_keys:
|
||||
trace_param_key = metadata_param_key.replace("trace_", "")
|
||||
if trace_param_key not in trace_params:
|
||||
updated_trace_value = clean_metadata.pop(
|
||||
metadata_param_key, None
|
||||
)
|
||||
if updated_trace_value is not None:
|
||||
trace_params[trace_param_key] = updated_trace_value
|
||||
|
||||
# Pop the trace specific keys that would have been popped if there were a new trace
|
||||
for key in list(
|
||||
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
|
||||
):
|
||||
clean_metadata.pop(key, None)
|
||||
|
||||
# Special keys that are found in the function arguments and not the metadata
|
||||
if "input" in update_trace_keys:
|
||||
trace_params["input"] = input
|
||||
if "output" in update_trace_keys:
|
||||
trace_params["output"] = output
|
||||
else: # don't overwrite an existing trace
|
||||
trace_params = {
|
||||
"id": trace_id,
|
||||
"name": trace_name,
|
||||
"session_id": session_id,
|
||||
"input": input,
|
||||
"version": clean_metadata.pop(
|
||||
"trace_version", clean_metadata.get("version", None)
|
||||
), # If provided just version, it will applied to the trace as well, if applied a trace version it will take precedence
|
||||
}
|
||||
for key in list(
|
||||
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
|
||||
):
|
||||
trace_params[key.replace("trace_", "")] = clean_metadata.pop(
|
||||
key, None
|
||||
)
|
||||
|
||||
if level == "ERROR":
|
||||
trace_params["status_message"] = output
|
||||
else:
|
||||
trace_params["output"] = output
|
||||
|
||||
cost = kwargs.get("response_cost", None)
|
||||
print_verbose(f"trace: {cost}")
|
||||
|
||||
if (
|
||||
litellm._langfuse_default_tags is not None
|
||||
and isinstance(litellm._langfuse_default_tags, list)
|
||||
|
@ -387,7 +421,7 @@ class LangFuseLogger:
|
|||
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||
"total_cost": cost if supports_costs else None,
|
||||
}
|
||||
generation_name = metadata.get("generation_name", None)
|
||||
generation_name = clean_metadata.pop("generation_name", None)
|
||||
if generation_name is None:
|
||||
# just log `litellm-{call_type}` as the generation name
|
||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
@ -402,7 +436,7 @@ class LangFuseLogger:
|
|||
|
||||
generation_params = {
|
||||
"name": generation_name,
|
||||
"id": metadata.get("generation_id", generation_id),
|
||||
"id": clean_metadata.pop("generation_id", generation_id),
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"model": kwargs["model"],
|
||||
|
@ -412,10 +446,11 @@ class LangFuseLogger:
|
|||
"usage": usage,
|
||||
"metadata": clean_metadata,
|
||||
"level": level,
|
||||
"version": clean_metadata.pop("version", None),
|
||||
}
|
||||
|
||||
if supports_prompt:
|
||||
generation_params["prompt"] = metadata.get("prompt", None)
|
||||
generation_params["prompt"] = clean_metadata.pop("prompt", None)
|
||||
|
||||
if output is not None and isinstance(output, str) and level == "ERROR":
|
||||
generation_params["status_message"] = output
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Langsmith
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests
|
||||
import dotenv, os # type: ignore
|
||||
import requests # type: ignore
|
||||
from datetime import datetime
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
import asyncio
|
||||
import types
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel # type: ignore
|
||||
|
||||
|
||||
def is_serializable(value):
|
||||
|
@ -79,8 +78,6 @@ class LangsmithLogger:
|
|||
except:
|
||||
response_obj = response_obj.dict() # type: ignore
|
||||
|
||||
print(f"response_obj: {response_obj}")
|
||||
|
||||
data = {
|
||||
"name": run_name,
|
||||
"run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
|
||||
|
@ -90,7 +87,6 @@ class LangsmithLogger:
|
|||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
}
|
||||
print(f"data: {data}")
|
||||
|
||||
response = requests.post(
|
||||
"https://api.smith.langchain.com/runs",
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
|
||||
|
||||
import dotenv, os, json
|
||||
import requests
|
||||
import litellm
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
|
@ -60,7 +59,7 @@ class OpenMeterLogger(CustomLogger):
|
|||
"total_tokens": response_obj["usage"].get("total_tokens"),
|
||||
}
|
||||
|
||||
subject = kwargs.get("user", None), # end-user passed in via 'user' param
|
||||
subject = (kwargs.get("user", None),) # end-user passed in via 'user' param
|
||||
if not subject:
|
||||
raise Exception("OpenMeter: user is required")
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# On success, log events to Prometheus
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
@ -19,7 +19,6 @@ class PrometheusLogger:
|
|||
**kwargs,
|
||||
):
|
||||
try:
|
||||
print(f"in init prometheus metrics")
|
||||
from prometheus_client import Counter
|
||||
|
||||
self.litellm_llm_api_failed_requests_metric = Counter(
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
@ -183,7 +183,6 @@ class PrometheusServicesLogger:
|
|||
)
|
||||
|
||||
async def async_service_failure_hook(self, payload: ServiceLoggerPayload):
|
||||
print(f"received error payload: {payload.error}")
|
||||
if self.mock_testing:
|
||||
self.mock_testing_failure_calls += 1
|
||||
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Promptlayer
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
from pydantic import BaseModel
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
||||
|
||||
class PromptLayerLogger:
|
||||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
|
@ -32,7 +33,11 @@ class PromptLayerLogger:
|
|||
tags = kwargs["litellm_params"]["metadata"]["pl_tags"]
|
||||
|
||||
# Remove "pl_tags" from metadata
|
||||
metadata = {k:v for k, v in kwargs["litellm_params"]["metadata"].items() if k != "pl_tags"}
|
||||
metadata = {
|
||||
k: v
|
||||
for k, v in kwargs["litellm_params"]["metadata"].items()
|
||||
if k != "pl_tags"
|
||||
}
|
||||
|
||||
print_verbose(
|
||||
f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}"
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -1,25 +1,82 @@
|
|||
#### What this does ####
|
||||
# Class for sending Slack Alerts #
|
||||
import dotenv, os
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import copy
|
||||
import traceback
|
||||
from litellm._logging import verbose_logger, verbose_proxy_logger
|
||||
import litellm
|
||||
import litellm, threading
|
||||
from typing import List, Literal, Any, Union, Optional, Dict
|
||||
from litellm.caching import DualCache
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
import datetime
|
||||
from pydantic import BaseModel
|
||||
from enum import Enum
|
||||
from datetime import datetime as dt, timedelta
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
import random
|
||||
|
||||
|
||||
class SlackAlerting:
|
||||
class LiteLLMBase(BaseModel):
|
||||
"""
|
||||
Implements default functions, all pydantic objects should have.
|
||||
"""
|
||||
|
||||
def json(self, **kwargs):
|
||||
try:
|
||||
return self.model_dump() # noqa
|
||||
except:
|
||||
# if using pydantic v1
|
||||
return self.dict()
|
||||
|
||||
|
||||
class SlackAlertingArgs(LiteLLMBase):
|
||||
daily_report_frequency: int = 12 * 60 * 60 # 12 hours
|
||||
report_check_interval: int = 5 * 60 # 5 minutes
|
||||
|
||||
|
||||
class DeploymentMetrics(LiteLLMBase):
|
||||
"""
|
||||
Metrics per deployment, stored in cache
|
||||
|
||||
Used for daily reporting
|
||||
"""
|
||||
|
||||
id: str
|
||||
"""id of deployment in router model list"""
|
||||
|
||||
failed_request: bool
|
||||
"""did it fail the request?"""
|
||||
|
||||
latency_per_output_token: Optional[float]
|
||||
"""latency/output token of deployment"""
|
||||
|
||||
updated_at: dt
|
||||
"""Current time of deployment being updated"""
|
||||
|
||||
|
||||
class SlackAlertingCacheKeys(Enum):
|
||||
"""
|
||||
Enum for deployment daily metrics keys - {deployment_id}:{enum}
|
||||
"""
|
||||
|
||||
failed_requests_key = "failed_requests_daily_metrics"
|
||||
latency_key = "latency_daily_metrics"
|
||||
report_sent_key = "daily_metrics_report_sent"
|
||||
|
||||
|
||||
class SlackAlerting(CustomLogger):
|
||||
"""
|
||||
Class for sending Slack Alerts
|
||||
"""
|
||||
|
||||
# Class variables or attributes
|
||||
def __init__(
|
||||
self,
|
||||
alerting_threshold: float = 300,
|
||||
internal_usage_cache: Optional[DualCache] = None,
|
||||
alerting_threshold: float = 300, # threshold for slow / hanging llm responses (in seconds)
|
||||
alerting: Optional[List] = [],
|
||||
alert_types: Optional[
|
||||
List[
|
||||
|
@ -29,6 +86,7 @@ class SlackAlerting:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
]
|
||||
] = [
|
||||
|
@ -37,18 +95,23 @@ class SlackAlerting:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
],
|
||||
alert_to_webhook_url: Optional[
|
||||
Dict
|
||||
] = None, # if user wants to separate alerts to diff channels
|
||||
alerting_args={},
|
||||
default_webhook_url: Optional[str] = None,
|
||||
):
|
||||
self.alerting_threshold = alerting_threshold
|
||||
self.alerting = alerting
|
||||
self.alert_types = alert_types
|
||||
self.internal_usage_cache = DualCache()
|
||||
self.internal_usage_cache = internal_usage_cache or DualCache()
|
||||
self.async_http_handler = AsyncHTTPHandler()
|
||||
self.alert_to_webhook_url = alert_to_webhook_url
|
||||
pass
|
||||
self.is_running = False
|
||||
self.alerting_args = SlackAlertingArgs(**alerting_args)
|
||||
self.default_webhook_url = default_webhook_url
|
||||
|
||||
def update_values(
|
||||
self,
|
||||
|
@ -56,6 +119,7 @@ class SlackAlerting:
|
|||
alerting_threshold: Optional[float] = None,
|
||||
alert_types: Optional[List] = None,
|
||||
alert_to_webhook_url: Optional[Dict] = None,
|
||||
alerting_args: Optional[Dict] = None,
|
||||
):
|
||||
if alerting is not None:
|
||||
self.alerting = alerting
|
||||
|
@ -63,7 +127,8 @@ class SlackAlerting:
|
|||
self.alerting_threshold = alerting_threshold
|
||||
if alert_types is not None:
|
||||
self.alert_types = alert_types
|
||||
|
||||
if alerting_args is not None:
|
||||
self.alerting_args = SlackAlertingArgs(**alerting_args)
|
||||
if alert_to_webhook_url is not None:
|
||||
# update the dict
|
||||
if self.alert_to_webhook_url is None:
|
||||
|
@ -90,18 +155,23 @@ class SlackAlerting:
|
|||
|
||||
def _add_langfuse_trace_id_to_alert(
|
||||
self,
|
||||
request_info: str,
|
||||
request_data: Optional[dict] = None,
|
||||
kwargs: Optional[dict] = None,
|
||||
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
||||
start_time: Optional[datetime.datetime] = None,
|
||||
end_time: Optional[datetime.datetime] = None,
|
||||
):
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Returns langfuse trace url
|
||||
"""
|
||||
# do nothing for now
|
||||
pass
|
||||
return request_info
|
||||
if (
|
||||
request_data is not None
|
||||
and request_data.get("metadata", {}).get("trace_id", None) is not None
|
||||
):
|
||||
trace_id = request_data["metadata"]["trace_id"]
|
||||
if litellm.utils.langFuseLogger is not None:
|
||||
base_url = litellm.utils.langFuseLogger.Langfuse.base_url
|
||||
return f"{base_url}/trace/{trace_id}"
|
||||
return None
|
||||
|
||||
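`_add_langfuse_trace_id_to_alert` now returns a clickable trace URL instead of passing `request_info` through: it reads `trace_id` from the request metadata and joins it with the Langfuse client's `base_url`. A pure-function sketch of that URL construction (inputs are illustrative; the real method pulls `base_url` off `litellm.utils.langFuseLogger`):

```python
# Sketch of the trace-URL construction above (hypothetical inputs).
from typing import Optional


def langfuse_trace_url(base_url: Optional[str], request_metadata: dict) -> Optional[str]:
    trace_id = request_metadata.get("trace_id")
    if base_url is None or trace_id is None:
        return None
    return f"{base_url}/trace/{trace_id}"


assert (
    langfuse_trace_url("https://cloud.langfuse.com", {"trace_id": "abc123"})
    == "https://cloud.langfuse.com/trace/abc123"
)
```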
def _response_taking_too_long_callback(
|
||||
def _response_taking_too_long_callback_helper(
|
||||
self,
|
||||
kwargs, # kwargs to completion
|
||||
start_time,
|
||||
|
@ -166,7 +236,7 @@ class SlackAlerting:
|
|||
return
|
||||
|
||||
time_difference_float, model, api_base, messages = (
|
||||
self._response_taking_too_long_callback(
|
||||
self._response_taking_too_long_callback_helper(
|
||||
kwargs=kwargs,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
|
@ -182,6 +252,9 @@ class SlackAlerting:
|
|||
and "metadata" in kwargs["litellm_params"]
|
||||
):
|
||||
_metadata = kwargs["litellm_params"]["metadata"]
|
||||
request_info = litellm.utils._add_key_name_and_team_to_alert(
|
||||
request_info=request_info, metadata=_metadata
|
||||
)
|
||||
|
||||
_deployment_latency_map = self._get_deployment_latencies_to_alert(
|
||||
metadata=_metadata
|
||||
|
@ -196,8 +269,178 @@ class SlackAlerting:
|
|||
alert_type="llm_too_slow",
|
||||
)
|
||||
|
||||
async def log_failure_event(self, original_exception: Exception):
|
||||
pass
|
||||
async def async_update_daily_reports(
|
||||
self, deployment_metrics: DeploymentMetrics
|
||||
) -> int:
|
||||
"""
|
||||
Store the perf by deployment in cache
|
||||
- Number of failed requests per deployment
|
||||
- Latency / output tokens per deployment
|
||||
|
||||
'deployment_id:daily_metrics:failed_requests'
|
||||
'deployment_id:daily_metrics:latency_per_output_token'
|
||||
|
||||
Returns
|
||||
int - count of metrics set (1 - if just latency, 2 - if failed + latency)
|
||||
"""
|
||||
|
||||
return_val = 0
|
||||
try:
|
||||
## FAILED REQUESTS ##
|
||||
if deployment_metrics.failed_request:
|
||||
await self.internal_usage_cache.async_increment_cache(
|
||||
key="{}:{}".format(
|
||||
deployment_metrics.id,
|
||||
SlackAlertingCacheKeys.failed_requests_key.value,
|
||||
),
|
||||
value=1,
|
||||
)
|
||||
|
||||
return_val += 1
|
||||
|
||||
## LATENCY ##
|
||||
if deployment_metrics.latency_per_output_token is not None:
|
||||
await self.internal_usage_cache.async_increment_cache(
|
||||
key="{}:{}".format(
|
||||
deployment_metrics.id, SlackAlertingCacheKeys.latency_key.value
|
||||
),
|
||||
value=deployment_metrics.latency_per_output_token,
|
||||
)
|
||||
|
||||
return_val += 1
|
||||
|
||||
return return_val
|
||||
except Exception as e:
|
||||
return 0
|
||||
|
||||
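`async_update_daily_reports` keeps two rolling counters per deployment in the internal cache, keyed `<deployment_id>:failed_requests_daily_metrics` and `<deployment_id>:latency_daily_metrics`, and returns how many it touched. The same bookkeeping with a plain dict standing in for `DualCache` (helper name and dict cache are assumptions for illustration):

```python
# Sketch of the per-deployment counters above; a dict stands in for DualCache.
from typing import Optional


def update_daily_metrics(
    cache: dict,
    deployment_id: str,
    failed: bool,
    latency_per_output_token: Optional[float],
) -> int:
    updated = 0
    if failed:
        key = f"{deployment_id}:failed_requests_daily_metrics"
        cache[key] = cache.get(key, 0) + 1
        updated += 1
    if latency_per_output_token is not None:
        key = f"{deployment_id}:latency_daily_metrics"
        cache[key] = cache.get(key, 0) + latency_per_output_token
        updated += 1
    return updated


cache: dict = {}
assert update_daily_metrics(cache, "deploy-1", failed=True, latency_per_output_token=0.02) == 2
assert cache["deploy-1:failed_requests_daily_metrics"] == 1
```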
async def send_daily_reports(self, router) -> bool:
|
||||
"""
|
||||
Send a daily report on:
|
||||
- Top 5 deployments with most failed requests
|
||||
- Top 5 slowest deployments (normalized by latency/output tokens)
|
||||
|
||||
Get the value from redis cache (if available) or in-memory and send it
|
||||
|
||||
Cleanup:
|
||||
- reset values in cache -> prevent memory leak
|
||||
|
||||
Returns:
|
||||
True -> if successfully sent
|
||||
False -> if not sent
|
||||
"""
|
||||
|
||||
ids = router.get_model_ids()
|
||||
|
||||
# get keys
|
||||
failed_request_keys = [
|
||||
"{}:{}".format(id, SlackAlertingCacheKeys.failed_requests_key.value)
|
||||
for id in ids
|
||||
]
|
||||
latency_keys = [
|
||||
"{}:{}".format(id, SlackAlertingCacheKeys.latency_key.value) for id in ids
|
||||
]
|
||||
|
||||
combined_metrics_keys = failed_request_keys + latency_keys # reduce cache calls
|
||||
|
||||
combined_metrics_values = await self.internal_usage_cache.async_batch_get_cache(
|
||||
keys=combined_metrics_keys
|
||||
) # [1, 2, None, ..]
|
||||
|
||||
all_none = True
|
||||
for val in combined_metrics_values:
|
||||
if val is not None:
|
||||
all_none = False
|
||||
|
||||
if all_none:
|
||||
return False
|
||||
|
||||
failed_request_values = combined_metrics_values[
|
||||
: len(failed_request_keys)
|
||||
] # # [1, 2, None, ..]
|
||||
latency_values = combined_metrics_values[len(failed_request_keys) :]
|
||||
|
||||
# find top 5 failed
|
||||
## Replace None values with a placeholder value (0 in this case)
|
||||
placeholder_value = 0
|
||||
replaced_failed_values = [
|
||||
value if value is not None else placeholder_value
|
||||
for value in failed_request_values
|
||||
]
|
||||
|
||||
## Get the indices of top 5 keys with the highest numerical values (ignoring None values)
|
||||
top_5_failed = sorted(
|
||||
range(len(replaced_failed_values)),
|
||||
key=lambda i: replaced_failed_values[i],
|
||||
reverse=True,
|
||||
)[:5]
|
||||
|
||||
# find top 5 slowest
|
||||
# Replace None values with a placeholder value (0 in this case)
|
||||
placeholder_value = 0
|
||||
replaced_slowest_values = [
|
||||
value if value is not None else placeholder_value
|
||||
for value in latency_values
|
||||
]
|
||||
|
||||
# Get the indices of top 5 values with the highest numerical values (ignoring None values)
|
||||
top_5_slowest = sorted(
|
||||
range(len(replaced_slowest_values)),
|
||||
key=lambda i: replaced_slowest_values[i],
|
||||
reverse=True,
|
||||
)[:5]
|
||||
|
||||
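Both "top 5" selections above use the same pattern: replace `None` metrics with a placeholder of 0, sort the indices by value in descending order, and keep the first five. Isolated as a helper (name is illustrative):

```python
# The top-k index selection used above, as a standalone helper.
def top_k_indices(values: list, k: int = 5) -> list:
    filled = [v if v is not None else 0 for v in values]  # placeholder 0 for missing metrics
    return sorted(range(len(filled)), key=lambda i: filled[i], reverse=True)[:k]


assert top_k_indices([3, None, 7, 1], k=2) == [2, 0]
```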
# format alert -> return the litellm model name + api base
|
||||
message = f"\n\nHere are today's key metrics 📈: \n\n"
|
||||
|
||||
message += "\n\n*❗️ Top 5 Deployments with Most Failed Requests:*\n\n"
|
||||
for i in range(len(top_5_failed)):
|
||||
key = failed_request_keys[top_5_failed[i]].split(":")[0]
|
||||
_deployment = router.get_model_info(key)
|
||||
if isinstance(_deployment, dict):
|
||||
deployment_name = _deployment["litellm_params"].get("model", "")
|
||||
else:
|
||||
return False
|
||||
|
||||
api_base = litellm.get_api_base(
|
||||
model=deployment_name,
|
||||
optional_params=(
|
||||
_deployment["litellm_params"] if _deployment is not None else {}
|
||||
),
|
||||
)
|
||||
if api_base is None:
|
||||
api_base = ""
|
||||
value = replaced_failed_values[top_5_failed[i]]
|
||||
message += f"\t{i+1}. Deployment: `{deployment_name}`, Failed Requests: `{value}`, API Base: `{api_base}`\n"
|
||||
|
||||
message += "\n\n*😅 Top 5 Slowest Deployments:*\n\n"
|
||||
for i in range(len(top_5_slowest)):
|
||||
key = latency_keys[top_5_slowest[i]].split(":")[0]
|
||||
_deployment = router.get_model_info(key)
|
||||
if _deployment is not None:
|
||||
deployment_name = _deployment["litellm_params"].get("model", "")
|
||||
else:
|
||||
deployment_name = ""
|
||||
api_base = litellm.get_api_base(
|
||||
model=deployment_name,
|
||||
optional_params=(
|
||||
_deployment["litellm_params"] if _deployment is not None else {}
|
||||
),
|
||||
)
|
||||
value = round(replaced_slowest_values[top_5_slowest[i]], 3)
|
||||
message += f"\t{i+1}. Deployment: `{deployment_name}`, Latency per output token: `{value}s/token`, API Base: `{api_base}`\n\n"
|
||||
|
||||
# cache cleanup -> reset values to 0
|
||||
latency_cache_keys = [(key, 0) for key in latency_keys]
|
||||
failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
|
||||
combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
|
||||
await self.internal_usage_cache.async_batch_set_cache(
|
||||
cache_list=combined_metrics_cache_keys
|
||||
)
|
||||
|
||||
# send alert
|
||||
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
|
||||
|
||||
return True
|
||||
|
||||
async def response_taking_too_long(
|
||||
self,
|
||||
|
@ -255,6 +498,11 @@ class SlackAlerting:
|
|||
# in that case we fallback to the api base set in the request metadata
|
||||
_metadata = request_data["metadata"]
|
||||
_api_base = _metadata.get("api_base", "")
|
||||
|
||||
request_info = litellm.utils._add_key_name_and_team_to_alert(
|
||||
request_info=request_info, metadata=_metadata
|
||||
)
|
||||
|
||||
if _api_base is None:
|
||||
_api_base = ""
|
||||
request_info += f"\nAPI Base: `{_api_base}`"
|
||||
|
@ -264,14 +512,13 @@ class SlackAlerting:
|
|||
)
|
||||
|
||||
if "langfuse" in litellm.success_callback:
|
||||
request_info = self._add_langfuse_trace_id_to_alert(
|
||||
request_info=request_info,
|
||||
langfuse_url = self._add_langfuse_trace_id_to_alert(
|
||||
request_data=request_data,
|
||||
type="hanging_request",
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
if langfuse_url is not None:
|
||||
request_info += "\n🪢 Langfuse Trace: {}".format(langfuse_url)
|
||||
|
||||
# add deployment latencies to alert
|
||||
_deployment_latency_map = self._get_deployment_latencies_to_alert(
|
||||
metadata=request_data.get("metadata", {})
|
||||
|
@ -404,6 +651,53 @@ class SlackAlerting:
|
|||
|
||||
return
|
||||
|
||||
async def model_added_alert(self, model_name: str, litellm_model_name: str):
|
||||
model_info = litellm.model_cost.get(litellm_model_name, {})
|
||||
model_info_str = ""
|
||||
for k, v in model_info.items():
|
||||
if k == "input_cost_per_token" or k == "output_cost_per_token":
|
||||
# when converting to string it should not be 1.63e-06
|
||||
v = "{:.8f}".format(v)
|
||||
|
||||
model_info_str += f"{k}: {v}\n"
|
||||
|
||||
message = f"""
|
||||
*🚅 New Model Added*
|
||||
Model Name: `{model_name}`
|
||||
|
||||
Usage OpenAI Python SDK:
|
||||
```
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="your_api_key",
|
||||
base_url={os.getenv("PROXY_BASE_URL", "http://0.0.0.0:4000")}
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="{model_name}", # model to send to the proxy
|
||||
messages = [
|
||||
{{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Model Info:
|
||||
```
|
||||
{model_info_str}
|
||||
```
|
||||
"""
|
||||
|
||||
await self.send_alert(
|
||||
message=message, level="Low", alert_type="new_model_added"
|
||||
)
|
||||
pass
|
||||
|
||||
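The `"{:.8f}".format(v)` step in `model_added_alert` exists so per-token prices render as fixed-point numbers in the Slack message rather than scientific notation:

```python
# Why the cost fields are formatted with "{:.8f}" before being put in the alert text.
v = 1.63e-06
assert str(v) == "1.63e-06"                # scientific notation, hard to read in Slack
assert "{:.8f}".format(v) == "0.00000163"  # fixed-point, as shown in the alert
```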
async def model_removed_alert(self, model_name: str):
|
||||
pass
|
||||
|
||||
async def send_alert(
|
||||
self,
|
||||
message: str,
|
||||
|
@ -414,7 +708,11 @@ class SlackAlerting:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
"new_model_added",
|
||||
"cooldown_deployment",
|
||||
],
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
|
||||
|
@ -439,9 +737,16 @@ class SlackAlerting:
|
|||
# Get the current timestamp
|
||||
current_time = datetime.now().strftime("%H:%M:%S")
|
||||
_proxy_base_url = os.getenv("PROXY_BASE_URL", None)
|
||||
formatted_message = (
|
||||
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
|
||||
)
|
||||
if alert_type == "daily_reports" or alert_type == "new_model_added":
|
||||
formatted_message = message
|
||||
else:
|
||||
formatted_message = (
|
||||
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
|
||||
)
|
||||
|
||||
if kwargs:
|
||||
for key, value in kwargs.items():
|
||||
formatted_message += f"\n\n{key}: `{value}`\n\n"
|
||||
if _proxy_base_url is not None:
|
||||
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
||||
|
||||
|
@ -451,6 +756,8 @@ class SlackAlerting:
|
|||
and alert_type in self.alert_to_webhook_url
|
||||
):
|
||||
slack_webhook_url = self.alert_to_webhook_url[alert_type]
|
||||
elif self.default_webhook_url is not None:
|
||||
slack_webhook_url = self.default_webhook_url
|
||||
else:
|
||||
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
|
||||
|
||||
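`send_alert` resolves its webhook in three steps: a per-alert-type entry in `alert_to_webhook_url`, then `default_webhook_url`, then the `SLACK_WEBHOOK_URL` environment variable. A sketch of that precedence as a free function (illustrative signature, not the class method):

```python
# Webhook resolution order implied by the branch above.
import os
from typing import Optional


def resolve_webhook_url(
    alert_type: str,
    alert_to_webhook_url: Optional[dict],
    default_webhook_url: Optional[str],
) -> Optional[str]:
    if alert_to_webhook_url is not None and alert_type in alert_to_webhook_url:
        return alert_to_webhook_url[alert_type]
    if default_webhook_url is not None:
        return default_webhook_url
    return os.getenv("SLACK_WEBHOOK_URL", None)


assert (
    resolve_webhook_url("db_exceptions", {"db_exceptions": "https://hooks.example/a"}, None)
    == "https://hooks.example/a"
)
```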
|
@ -468,3 +775,113 @@ class SlackAlerting:
|
|||
pass
|
||||
else:
|
||||
print("Error sending slack alert. Error=", response.text) # noqa
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
"""Log deployment latency"""
|
||||
if "daily_reports" in self.alert_types:
|
||||
model_id = (
|
||||
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
|
||||
)
|
||||
response_s: timedelta = end_time - start_time
|
||||
|
||||
final_value = response_s
|
||||
total_tokens = 0
|
||||
|
||||
if isinstance(response_obj, litellm.ModelResponse):
|
||||
completion_tokens = response_obj.usage.completion_tokens
|
||||
final_value = float(response_s.total_seconds() / completion_tokens)
|
||||
|
||||
await self.async_update_daily_reports(
|
||||
DeploymentMetrics(
|
||||
id=model_id,
|
||||
failed_request=False,
|
||||
latency_per_output_token=final_value,
|
||||
updated_at=litellm.utils.get_utc_datetime(),
|
||||
)
|
||||
)
|
||||
|
||||
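`async_log_success_event` normalises each deployment's latency to seconds per completion token before feeding it into the daily report. The arithmetic in isolation:

```python
# Latency normalisation used above: wall-clock seconds divided by completion tokens.
from datetime import datetime, timedelta


def latency_per_output_token(start: datetime, end: datetime, completion_tokens: int) -> float:
    elapsed: timedelta = end - start
    return float(elapsed.total_seconds() / completion_tokens)


start = datetime(2024, 1, 1, 12, 0, 0)
assert latency_per_output_token(start, start + timedelta(seconds=2), completion_tokens=100) == 0.02
```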
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
"""Log failure + deployment latency"""
|
||||
if "daily_reports" in self.alert_types:
|
||||
model_id = (
|
||||
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
|
||||
)
|
||||
await self.async_update_daily_reports(
|
||||
DeploymentMetrics(
|
||||
id=model_id,
|
||||
failed_request=True,
|
||||
latency_per_output_token=None,
|
||||
updated_at=litellm.utils.get_utc_datetime(),
|
||||
)
|
||||
)
|
||||
if "llm_exceptions" in self.alert_types:
|
||||
original_exception = kwargs.get("exception", None)
|
||||
|
||||
await self.send_alert(
|
||||
message="LLM API Failure - " + str(original_exception),
|
||||
level="High",
|
||||
alert_type="llm_exceptions",
|
||||
)
|
||||
|
||||
async def _run_scheduler_helper(self, llm_router) -> bool:
|
||||
"""
|
||||
Returns:
|
||||
- True -> report sent
|
||||
- False -> report not sent
|
||||
"""
|
||||
report_sent_bool = False
|
||||
|
||||
report_sent = await self.internal_usage_cache.async_get_cache(
|
||||
key=SlackAlertingCacheKeys.report_sent_key.value
|
||||
) # None | datetime
|
||||
|
||||
current_time = litellm.utils.get_utc_datetime()
|
||||
|
||||
if report_sent is None:
|
||||
_current_time = current_time.isoformat()
|
||||
await self.internal_usage_cache.async_set_cache(
|
||||
key=SlackAlertingCacheKeys.report_sent_key.value,
|
||||
value=_current_time,
|
||||
)
|
||||
else:
|
||||
# check if current time - interval >= time last sent
|
||||
delta = current_time - timedelta(
|
||||
seconds=self.alerting_args.daily_report_frequency
|
||||
)
|
||||
|
||||
if isinstance(report_sent, str):
|
||||
report_sent = dt.fromisoformat(report_sent)
|
||||
|
||||
if delta >= report_sent:
|
||||
# Sneak in the reporting logic here
|
||||
await self.send_daily_reports(router=llm_router)
|
||||
# Also, don't forget to update the report_sent time after sending the report!
|
||||
_current_time = current_time.isoformat()
|
||||
await self.internal_usage_cache.async_set_cache(
|
||||
key=SlackAlertingCacheKeys.report_sent_key.value,
|
||||
value=_current_time,
|
||||
)
|
||||
report_sent_bool = True
|
||||
|
||||
return report_sent_bool
|
||||
|
||||
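`_run_scheduler_helper` decides whether the daily report is due by checking `now - daily_report_frequency >= last_sent` against the timestamp cached under `daily_metrics_report_sent`. The check as a pure function (assumes `last_sent` has already been parsed back into a datetime):

```python
# The "is the daily report due?" check above, isolated.
from datetime import datetime, timedelta


def report_is_due(now: datetime, last_sent: datetime, frequency_s: int) -> bool:
    return (now - timedelta(seconds=frequency_s)) >= last_sent


last = datetime(2024, 1, 1, 0, 0, 0)
assert report_is_due(last + timedelta(hours=13), last, frequency_s=12 * 60 * 60)
assert not report_is_due(last + timedelta(hours=1), last, frequency_s=12 * 60 * 60)
```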
async def _run_scheduled_daily_report(self, llm_router: Optional[Any] = None):
|
||||
"""
|
||||
If 'daily_reports' enabled
|
||||
|
||||
Ping redis cache every 5 minutes to check if we should send the report
|
||||
|
||||
If yes -> call send_daily_report()
|
||||
"""
|
||||
if llm_router is None or self.alert_types is None:
|
||||
return
|
||||
|
||||
if "daily_reports" in self.alert_types:
|
||||
while True:
|
||||
await self._run_scheduler_helper(llm_router=llm_router)
|
||||
interval = random.randint(
|
||||
self.alerting_args.report_check_interval - 3,
|
||||
self.alerting_args.report_check_interval + 3,
|
||||
) # shuffle to prevent collisions
|
||||
await asyncio.sleep(interval)
|
||||
return
|
||||
|
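The scheduler loop above sleeps for `report_check_interval` plus or minus 3 seconds so that multiple proxy workers do not all wake and try to send the report at the same instant. The jitter in isolation:

```python
# Jittered poll interval used by the daily-report scheduler above.
import random


def jittered_interval(base_s: int, jitter_s: int = 3) -> int:
    return random.randint(base_s - jitter_s, base_s + jitter_s)


interval = jittered_interval(5 * 60)
assert 297 <= interval <= 303
```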
|
|
@ -2,7 +2,7 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import os, types, traceback
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time, httpx
|
||||
import requests # type: ignore
|
||||
import time, httpx # type: ignore
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Choices, Message
|
||||
import litellm
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class AlephAlphaError(Exception):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests, copy
|
||||
import requests, copy # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
|
||||
|
@ -9,7 +9,7 @@ import litellm
|
|||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
from .base import BaseLLM
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class AnthropicConstants(Enum):
|
||||
|
@ -184,11 +184,6 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
message=str(completion_response["error"]),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
elif len(completion_response["content"]) == 0:
|
||||
raise AnthropicError(
|
||||
message="No content in response",
|
||||
status_code=500,
|
||||
)
|
||||
else:
|
||||
text_content = ""
|
||||
tool_calls = []
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional, Union, Any
|
||||
from typing import Optional, Union, Any, Literal
|
||||
import types, requests
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
|
@ -12,7 +12,7 @@ from litellm.utils import (
|
|||
from typing import Callable, Optional, BinaryIO
|
||||
from litellm import OpenAIConfig
|
||||
import litellm, json
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
|
||||
from openai import AzureOpenAI, AsyncAzureOpenAI
|
||||
import uuid
|
||||
|
@ -952,6 +952,81 @@ class AzureChatCompletion(BaseLLM):
|
|||
)
|
||||
raise e
|
||||
|
||||
def get_headers(
|
||||
self,
|
||||
model: Optional[str],
|
||||
api_key: str,
|
||||
api_base: str,
|
||||
api_version: str,
|
||||
timeout: float,
|
||||
mode: str,
|
||||
messages: Optional[list] = None,
|
||||
input: Optional[list] = None,
|
||||
prompt: Optional[str] = None,
|
||||
) -> dict:
|
||||
client_session = litellm.client_session or httpx.Client(
|
||||
transport=CustomHTTPTransport(), # handle dall-e-2 calls
|
||||
)
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
## build base url - assume api base includes resource name
|
||||
if not api_base.endswith("/"):
|
||||
api_base += "/"
|
||||
api_base += f"{model}"
|
||||
client = AzureOpenAI(
|
||||
base_url=api_base,
|
||||
api_version=api_version,
|
||||
api_key=api_key,
|
||||
timeout=timeout,
|
||||
http_client=client_session,
|
||||
)
|
||||
model = None
|
||||
# cloudflare ai gateway, needs model=None
|
||||
else:
|
||||
client = AzureOpenAI(
|
||||
api_version=api_version,
|
||||
azure_endpoint=api_base,
|
||||
api_key=api_key,
|
||||
timeout=timeout,
|
||||
http_client=client_session,
|
||||
)
|
||||
|
||||
# only run this check if it's not cloudflare ai gateway
|
||||
if model is None and mode != "image_generation":
|
||||
raise Exception("model is not set")
|
||||
|
||||
completion = None
|
||||
|
||||
if messages is None:
|
||||
messages = [{"role": "user", "content": "Hey"}]
|
||||
try:
|
||||
completion = client.chat.completions.with_raw_response.create(
|
||||
model=model, # type: ignore
|
||||
messages=messages, # type: ignore
|
||||
)
|
||||
except Exception as e:
|
||||
raise e
|
||||
response = {}
|
||||
|
||||
if completion is None or not hasattr(completion, "headers"):
|
||||
raise Exception("invalid completion response")
|
||||
|
||||
if (
|
||||
completion.headers.get("x-ratelimit-remaining-requests", None) is not None
|
||||
): # not provided for dall-e requests
|
||||
response["x-ratelimit-remaining-requests"] = completion.headers[
|
||||
"x-ratelimit-remaining-requests"
|
||||
]
|
||||
|
||||
if completion.headers.get("x-ratelimit-remaining-tokens", None) is not None:
|
||||
response["x-ratelimit-remaining-tokens"] = completion.headers[
|
||||
"x-ratelimit-remaining-tokens"
|
||||
]
|
||||
|
||||
if completion.headers.get("x-ms-region", None) is not None:
|
||||
response["x-ms-region"] = completion.headers["x-ms-region"]
|
||||
|
||||
return response
|
||||
|
||||
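The new `get_headers` makes one raw-response chat call and copies a handful of rate-limit/region headers into a plain dict. The extraction step on its own (a plain dict stands in for the raw response's header mapping):

```python
# Header extraction mirroring get_headers above.
def extract_proxy_headers(headers: dict) -> dict:
    wanted = ["x-ratelimit-remaining-requests", "x-ratelimit-remaining-tokens", "x-ms-region"]
    return {name: headers[name] for name in wanted if headers.get(name) is not None}


assert extract_proxy_headers(
    {"x-ratelimit-remaining-tokens": "39000", "x-ms-region": "East US", "date": "..."}
) == {"x-ratelimit-remaining-tokens": "39000", "x-ms-region": "East US"}
```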
async def ahealth_check(
|
||||
self,
|
||||
model: Optional[str],
|
||||
|
@ -963,7 +1038,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
messages: Optional[list] = None,
|
||||
input: Optional[list] = None,
|
||||
prompt: Optional[str] = None,
|
||||
):
|
||||
) -> dict:
|
||||
client_session = litellm.aclient_session or httpx.AsyncClient(
|
||||
transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls
|
||||
)
|
||||
|
@ -1040,4 +1115,8 @@ class AzureChatCompletion(BaseLLM):
|
|||
response["x-ratelimit-remaining-tokens"] = completion.headers[
|
||||
"x-ratelimit-remaining-tokens"
|
||||
]
|
||||
|
||||
if completion.headers.get("x-ms-region", None) is not None:
|
||||
response["x-ms-region"] = completion.headers["x-ms-region"]
|
||||
|
||||
return response
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, Union, Any
|
||||
import types, requests
|
||||
import types, requests # type: ignore
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
ModelResponse,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
|
|
|
@ -163,10 +163,9 @@ class AmazonAnthropicClaude3Config:
|
|||
"stop",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"extra_headers"
|
||||
"extra_headers",
|
||||
]
|
||||
|
||||
|
||||
def map_openai_params(self, non_default_params: dict, optional_params: dict):
|
||||
for param, value in non_default_params.items():
|
||||
if param == "max_tokens":
|
||||
|
@ -534,10 +533,12 @@ class AmazonStabilityConfig:
|
|||
|
||||
def add_custom_header(headers):
|
||||
"""Closure to capture the headers and add them."""
|
||||
|
||||
def callback(request, **kwargs):
|
||||
"""Actual callback function that Boto3 will call."""
|
||||
for header_name, header_value in headers.items():
|
||||
request.headers.add_header(header_name, header_value)
|
||||
|
||||
return callback
|
||||
|
||||
|
||||
|
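`add_custom_header` is a closure: it captures the extra headers once and returns the callback that boto3 invokes on the `before-sign.bedrock-runtime.*` event, which then writes each header onto the outgoing request. The same pattern with a stand-in request object (the real callback uses `request.headers.add_header`):

```python
# Closure pattern used above; FakeRequest stands in for the botocore request object.
def add_custom_header(headers: dict):
    def callback(request, **kwargs):
        for name, value in headers.items():
            request.headers[name] = value  # simplified; botocore uses headers.add_header(...)
    return callback


class FakeRequest:
    def __init__(self):
        self.headers: dict = {}


req = FakeRequest()
add_custom_header({"x-tenant-id": "acme"})(req)
assert req.headers == {"x-tenant-id": "acme"}
```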
@ -672,7 +673,9 @@ def init_bedrock_client(
|
|||
config=config,
|
||||
)
|
||||
if extra_headers:
|
||||
client.meta.events.register('before-sign.bedrock-runtime.*', add_custom_header(extra_headers))
|
||||
client.meta.events.register(
|
||||
"before-sign.bedrock-runtime.*", add_custom_header(extra_headers)
|
||||
)
|
||||
|
||||
return client
|
||||
|
||||
|
@ -1224,7 +1227,7 @@ def _embedding_func_single(
|
|||
"input_type", "search_document"
|
||||
) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3
|
||||
data = {"texts": [input], **inference_params} # type: ignore
|
||||
body = json.dumps(data).encode("utf-8")
|
||||
body = json.dumps(data).encode("utf-8") # type: ignore
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
|
@ -1416,7 +1419,7 @@ def image_generation(
|
|||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
body={body},
|
||||
body={body}, # type: ignore
|
||||
modelId={modelId},
|
||||
accept="application/json",
|
||||
contentType="application/json",
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time, traceback
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class CohereError(Exception):
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time, traceback
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .prompt_templates.factory import cohere_message_pt
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time, traceback
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from itertools import chain
|
||||
import requests, types, time
|
||||
import requests, types, time # type: ignore
|
||||
import json, uuid
|
||||
import traceback
|
||||
from typing import Optional
|
||||
import litellm
|
||||
import httpx, aiohttp, asyncio
|
||||
import httpx, aiohttp, asyncio # type: ignore
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
||||
|
@ -220,7 +220,10 @@ def get_ollama_response(
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -232,7 +235,9 @@ def get_ollama_response(
|
|||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + model
|
||||
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore
|
||||
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count", len(response_json.get("message", dict()).get("content", ""))
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -273,7 +278,10 @@ def ollama_completion_stream(url, data, logging_obj):
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -314,9 +322,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
|
|||
first_chunk_content = first_chunk.choices[0].delta.content or ""
|
||||
response_content = first_chunk_content + "".join(
|
||||
[
|
||||
chunk.choices[0].delta.content
|
||||
async for chunk in streamwrapper
|
||||
if chunk.choices[0].delta.content]
|
||||
chunk.choices[0].delta.content
|
||||
async for chunk in streamwrapper
|
||||
if chunk.choices[0].delta.content
|
||||
]
|
||||
)
|
||||
function_call = json.loads(response_content)
|
||||
delta = litellm.utils.Delta(
|
||||
|
@ -324,7 +333,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -373,7 +385,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -387,7 +402,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + data["model"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore
|
||||
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count",
|
||||
len(response_json.get("message", dict()).get("content", "")),
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -474,3 +492,25 @@ async def ollama_aembeddings(
|
|||
"total_tokens": total_input_tokens,
|
||||
}
|
||||
return model_response
|
||||
|
||||
|
||||
def ollama_embeddings(
|
||||
api_base: str,
|
||||
model: str,
|
||||
prompts: list,
|
||||
optional_params=None,
|
||||
logging_obj=None,
|
||||
model_response=None,
|
||||
encoding=None,
|
||||
):
|
||||
return asyncio.run(
|
||||
ollama_aembeddings(
|
||||
api_base,
|
||||
model,
|
||||
prompts,
|
||||
optional_params,
|
||||
logging_obj,
|
||||
model_response,
|
||||
encoding,
|
||||
)
|
||||
)
|
||||
|
|
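`ollama_embeddings` above is a thin synchronous wrapper: it just drives `ollama_aembeddings` with `asyncio.run`. The same wrapper shape with a toy coroutine (note `asyncio.run` cannot be called from inside an already-running event loop, which is why the async path is taken directly when `aembedding` is set):

```python
# Sync-over-async wrapper pattern used above (toy coroutine, not the real ollama call).
import asyncio
from typing import List


async def aembed(texts: List[str]) -> List[List[float]]:
    await asyncio.sleep(0)  # stands in for the real HTTP round trip
    return [[float(len(t))] for t in texts]


def embed(texts: List[str]) -> List[List[float]]:
    return asyncio.run(aembed(texts))


assert embed(["hi", "there"]) == [[2.0], [5.0]]
```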
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
|
|
|
@ -22,7 +22,6 @@ from litellm.utils import (
|
|||
TextCompletionResponse,
|
||||
)
|
||||
from typing import Callable, Optional
|
||||
import aiohttp, requests
|
||||
import litellm
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from openai import OpenAI, AsyncOpenAI
|
||||
|
@ -531,6 +530,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model=model,
|
||||
custom_llm_provider="openai",
|
||||
logging_obj=logging_obj,
|
||||
stream_options=data.get("stream_options", None),
|
||||
)
|
||||
return streamwrapper
|
||||
|
||||
|
@ -580,6 +580,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model=model,
|
||||
custom_llm_provider="openai",
|
||||
logging_obj=logging_obj,
|
||||
stream_options=data.get("stream_options", None),
|
||||
)
|
||||
return streamwrapper
|
||||
except (
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
|
|
|
@ -981,7 +981,7 @@ def anthropic_messages_pt(messages: list):
|
|||
# add role=tool support to allow function call result/error submission
|
||||
user_message_types = {"user", "tool", "function"}
|
||||
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
|
||||
new_messages = []
|
||||
new_messages: list = []
|
||||
msg_i = 0
|
||||
tool_use_param = False
|
||||
while msg_i < len(messages):
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import os, types
|
||||
import json
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import os, types, traceback
|
||||
from enum import Enum
|
||||
import json
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, Any
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage
|
||||
import sys
|
||||
from copy import deepcopy
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
||||
|
@ -295,7 +295,7 @@ def completion(
|
|||
EndpointName={model},
|
||||
InferenceComponentName={model_id},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
Body={data}, # type: ignore
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
|
@ -321,7 +321,7 @@ def completion(
|
|||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
Body={data}, # type: ignore
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
|
@ -688,7 +688,7 @@ def embedding(
|
|||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
Body={data}, # type: ignore
|
||||
CustomAttributes="accept_eula=true",
|
||||
)""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
|
|
|
@ -6,11 +6,11 @@ Reference: https://docs.together.ai/docs/openai-api-compatibility
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, Union, List
|
||||
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
|
||||
import litellm, uuid
|
||||
import httpx, inspect
|
||||
import httpx, inspect # type: ignore
|
||||
|
||||
|
||||
class VertexAIError(Exception):
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests, copy
|
||||
import requests, copy # type: ignore
|
||||
import time, uuid
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
|
||||
|
@ -17,7 +17,7 @@ from .prompt_templates.factory import (
|
|||
extract_between_tags,
|
||||
parse_xml_params,
|
||||
)
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class VertexAIError(Exception):
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import os
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time, httpx
|
||||
import requests # type: ignore
|
||||
import time, httpx # type: ignore
|
||||
from typing import Callable, Any
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
|
|
@ -3,8 +3,8 @@ import json, types, time # noqa: E401
|
|||
from contextlib import contextmanager
|
||||
from typing import Callable, Dict, Optional, Any, Union, List
|
||||
|
||||
import httpx
|
||||
import requests
|
||||
import httpx # type: ignore
|
||||
import requests # type: ignore
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, get_secret, Usage
|
||||
|
||||
|
|
|
@ -12,9 +12,9 @@ from typing import Any, Literal, Union, BinaryIO
|
|||
from functools import partial
|
||||
import dotenv, traceback, random, asyncio, time, contextvars
|
||||
from copy import deepcopy
|
||||
|
||||
import httpx
|
||||
import litellm
|
||||
|
||||
from ._logging import verbose_logger
|
||||
from litellm import ( # type: ignore
|
||||
client,
|
||||
|
@ -188,6 +188,7 @@ async def acompletion(
|
|||
top_p: Optional[float] = None,
|
||||
n: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
stream_options: Optional[dict] = None,
|
||||
stop=None,
|
||||
max_tokens: Optional[int] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
|
@ -207,6 +208,7 @@ async def acompletion(
|
|||
api_version: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
model_list: Optional[list] = None, # pass in a list of api_base,keys, etc.
|
||||
extra_headers: Optional[dict] = None,
|
||||
# Optional liteLLM function params
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -224,6 +226,7 @@ async def acompletion(
|
|||
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
|
||||
n (int, optional): The number of completions to generate (default is 1).
|
||||
stream (bool, optional): If True, return a streaming response (default is False).
|
||||
stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
|
||||
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
|
||||
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
|
||||
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
|
||||
|
@ -261,6 +264,7 @@ async def acompletion(
|
|||
"top_p": top_p,
|
||||
"n": n,
|
||||
"stream": stream,
|
||||
"stream_options": stream_options,
|
||||
"stop": stop,
|
||||
"max_tokens": max_tokens,
|
||||
"presence_penalty": presence_penalty,
|
||||
|
@ -305,6 +309,7 @@ async def acompletion(
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "text-completion-openai"
|
||||
or custom_llm_provider == "huggingface"
|
||||
or custom_llm_provider == "ollama"
|
||||
|
@ -457,6 +462,7 @@ def completion(
|
|||
top_p: Optional[float] = None,
|
||||
n: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
stream_options: Optional[dict] = None,
|
||||
stop=None,
|
||||
max_tokens: Optional[int] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
|
@ -496,6 +502,7 @@ def completion(
|
|||
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
|
||||
n (int, optional): The number of completions to generate (default is 1).
|
||||
stream (bool, optional): If True, return a streaming response (default is False).
|
||||
stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
|
||||
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
|
||||
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
|
||||
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
|
||||
|
@ -573,6 +580,7 @@ def completion(
|
|||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stream_options",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
|
@ -648,6 +656,8 @@ def completion(
|
|||
"base_model",
|
||||
"stream_timeout",
|
||||
"supports_system_message",
|
||||
"region_name",
|
||||
"allowed_model_region",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
@ -783,6 +793,7 @@ def completion(
|
|||
top_p=top_p,
|
||||
n=n,
|
||||
stream=stream,
|
||||
stream_options=stream_options,
|
||||
stop=stop,
|
||||
max_tokens=max_tokens,
|
||||
presence_penalty=presence_penalty,
|
||||
|
@ -982,6 +993,7 @@ def completion(
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "anyscale"
|
||||
or custom_llm_provider == "mistral"
|
||||
or custom_llm_provider == "openai"
|
||||
|
@ -2565,6 +2577,7 @@ async def aembedding(*args, **kwargs):
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "fireworks_ai"
|
||||
or custom_llm_provider == "ollama"
|
||||
or custom_llm_provider == "vertex_ai"
|
||||
|
@ -2714,6 +2727,8 @@ def embedding(
|
|||
"ttl",
|
||||
"cache",
|
||||
"no-log",
|
||||
"region_name",
|
||||
"allowed_model_region",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
@ -2947,16 +2962,18 @@ def embedding(
|
|||
model=model, # type: ignore
|
||||
llm_provider="ollama", # type: ignore
|
||||
)
|
||||
if aembedding:
|
||||
response = ollama.ollama_aembeddings(
|
||||
api_base=api_base,
|
||||
model=model,
|
||||
prompts=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
model_response=EmbeddingResponse(),
|
||||
)
|
||||
ollama_embeddings_fn = (
|
||||
ollama.ollama_aembeddings if aembedding else ollama.ollama_embeddings
|
||||
)
|
||||
response = ollama_embeddings_fn(
|
||||
api_base=api_base,
|
||||
model=model,
|
||||
prompts=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
model_response=EmbeddingResponse(),
|
||||
)
|
||||
elif custom_llm_provider == "sagemaker":
|
||||
response = sagemaker.embedding(
|
||||
model=model,
|
||||
|
@ -3085,11 +3102,13 @@ async def atext_completion(*args, **kwargs):
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "fireworks_ai"
|
||||
or custom_llm_provider == "text-completion-openai"
|
||||
or custom_llm_provider == "huggingface"
|
||||
or custom_llm_provider == "ollama"
|
||||
or custom_llm_provider == "vertex_ai"
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
): # currently implemented aiohttp calls for just azure and openai, soon all.
|
||||
# Await normally
|
||||
response = await loop.run_in_executor(None, func_with_context)
|
||||
|
@ -3120,6 +3139,8 @@ async def atext_completion(*args, **kwargs):
|
|||
## TRANSLATE CHAT TO TEXT FORMAT ##
|
||||
if isinstance(response, TextCompletionResponse):
|
||||
return response
|
||||
elif asyncio.iscoroutine(response):
|
||||
response = await response
|
||||
|
||||
text_completion_response = TextCompletionResponse()
|
||||
text_completion_response["id"] = response.get("id", None)
|
||||
|
@ -3581,6 +3602,8 @@ def image_generation(
|
|||
"caching_groups",
|
||||
"ttl",
|
||||
"cache",
|
||||
"region_name",
|
||||
"allowed_model_region",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
|
|
@ -739,6 +739,24 @@
|
|||
"litellm_provider": "mistral",
|
||||
"mode": "embedding"
|
||||
},
|
||||
"deepseek-chat": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 32000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000014,
|
||||
"output_cost_per_token": 0.00000028,
|
||||
"litellm_provider": "deepseek",
|
||||
"mode": "chat"
|
||||
},
|
||||
"deepseek-coder": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 16000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000014,
|
||||
"output_cost_per_token": 0.00000028,
|
||||
"litellm_provider": "deepseek",
|
||||
"mode": "chat"
|
||||
},
|
||||
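The new `deepseek-chat` / `deepseek-coder` entries price both models at 0.14 USD per million input tokens and 0.28 USD per million output tokens. The cost arithmetic these per-token fields feed into:

```python
# Cost arithmetic implied by the per-token fields above, using the deepseek-chat rates.
def request_cost(prompt_tokens: int, completion_tokens: int,
                 input_cost_per_token: float, output_cost_per_token: float) -> float:
    return prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token


cost = request_cost(1000, 500, input_cost_per_token=0.00000014, output_cost_per_token=0.00000028)
assert abs(cost - 0.00028) < 1e-12
```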
"groq/llama2-70b-4096": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 4096,
|
||||
|
@ -1060,8 +1078,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1072,8 +1090,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1084,8 +1102,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
[Generated Next.js dashboard assets (ui/litellm-dashboard build output) were regenerated. The only substantive changes are hash/ID bumps: build ID "e55gTzpa2g2-9SwXgA9Uo" -> "K8KXTbmuI2ArWjjdMi2iq", webpack chunk "webpack-202e312607f242a1.js" -> "webpack-5b257e1ab47d4b4a.js", page chunk "app/page-d9bdfedbff191985.js" -> "app/page-c804e862b63be987.js", supporting chunk "142-11990a208bf93746.js" -> "566-ccd699ab19124658.js", stylesheet "00c2ddbcd01819c0.css" -> "a1602eb39f799143.css". The full minified HTML and RSC payload diffs are omitted here.]
|
|
|
@ -4,6 +4,22 @@ model_list:
|
|||
api_key: my-fake-key
|
||||
model: openai/my-fake-model
|
||||
model_name: fake-openai-endpoint
|
||||
- litellm_params:
|
||||
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
||||
api_key: my-fake-key-2
|
||||
model: openai/my-fake-model-2
|
||||
model_name: fake-openai-endpoint
|
||||
- litellm_params:
|
||||
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
||||
api_key: my-fake-key-3
|
||||
model: openai/my-fake-model-3
|
||||
model_name: fake-openai-endpoint
|
||||
- model_name: gpt-4
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo
|
||||
- litellm_params:
|
||||
model: together_ai/codellama/CodeLlama-13b-Instruct-hf
|
||||
model_name: CodeLlama-13b-Instruct
|
||||
router_settings:
|
||||
num_retries: 0
|
||||
enable_pre_call_checks: true
|
||||
|
@ -15,8 +31,11 @@ router_settings:
|
|||
routing_strategy: "latency-based-routing"
|
||||
|
||||
litellm_settings:
|
||||
success_callback: ["openmeter"]
|
||||
success_callback: ["langfuse"]
|
||||
|
||||
general_settings:
|
||||
alerting: ["slack"]
|
||||
alert_types: ["llm_exceptions"]
|
||||
alert_types: ["llm_exceptions", "daily_reports"]
|
||||
alerting_args:
|
||||
daily_report_frequency: 60 # every minute
|
||||
report_check_interval: 5 # every 5s
|
|
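For readers who set the router up in code rather than through the proxy YAML above, the following is a minimal sketch of the equivalent programmatic settings. The model entry mirrors the fake-openai-endpoint from the config; the rest of the arguments are illustrative, not the proxy's own config loader.

```
# Rough programmatic equivalent of the router_settings shown above (a sketch, assumptions noted inline).
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
            },
        },
    ],
    routing_strategy="latency-based-routing",
    num_retries=0,
    enable_pre_call_checks=True,  # enables the context-window / rpm / region pre-call filters
)
```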
@ -458,6 +458,27 @@ class UpdateUserRequest(GenerateRequestBase):
|
|||
return values
|
||||
|
||||
|
||||
class NewEndUserRequest(LiteLLMBase):
|
||||
user_id: str
|
||||
alias: Optional[str] = None # human-friendly alias
|
||||
blocked: bool = False # allow/disallow requests for this end-user
|
||||
max_budget: Optional[float] = None
|
||||
budget_id: Optional[str] = None # give either a budget_id or max_budget
|
||||
allowed_model_region: Optional[Literal["eu"]] = (
|
||||
None # require all user requests to use models in this specific region
|
||||
)
|
||||
default_model: Optional[str] = (
|
||||
None # if no equivalent model in allowed region - default all requests to this model
|
||||
)
|
||||
|
||||
@root_validator(pre=True)
|
||||
def check_user_info(cls, values):
|
||||
if values.get("max_budget") is not None and values.get("budget_id") is not None:
|
||||
raise ValueError("Set either 'max_budget' or 'budget_id', not both.")
|
||||
|
||||
return values
|
||||
|
||||
|
||||
class Member(LiteLLMBase):
|
||||
role: Literal["admin", "user"]
|
||||
user_id: Optional[str] = None
|
||||
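The interesting part of `NewEndUserRequest` is the mutual-exclusion rule between `max_budget` and `budget_id`. Below is a minimal standalone sketch of that same check, using a pydantic v1-style `root_validator` as in the diff; the class name here is illustrative, not a litellm type.

```
from typing import Optional
from pydantic import BaseModel, root_validator


class EndUserBudget(BaseModel):
    # mirrors the rule in NewEndUserRequest: give either a budget_id or a max_budget, not both
    max_budget: Optional[float] = None
    budget_id: Optional[str] = None

    @root_validator(pre=True)
    def check_budget(cls, values):
        if values.get("max_budget") is not None and values.get("budget_id") is not None:
            raise ValueError("Set either 'max_budget' or 'budget_id', not both.")
        return values


EndUserBudget(max_budget=10.0)                       # ok
# EndUserBudget(max_budget=10.0, budget_id="b-1")    # raises ValueError
```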
|
@ -494,6 +515,8 @@ class NewTeamRequest(TeamBase):
|
|||
|
||||
class GlobalEndUsersSpend(LiteLLMBase):
|
||||
api_key: Optional[str] = None
|
||||
startTime: Optional[datetime] = None
|
||||
endTime: Optional[datetime] = None
|
||||
|
||||
|
||||
class TeamMemberAddRequest(LiteLLMBase):
|
||||
|
@ -836,6 +859,7 @@ class UserAPIKeyAuth(
|
|||
|
||||
api_key: Optional[str] = None
|
||||
user_role: Optional[Literal["proxy_admin", "app_owner", "app_user"]] = None
|
||||
allowed_model_region: Optional[Literal["eu"]] = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
def check_api_key(cls, values):
|
||||
|
@ -881,6 +905,8 @@ class LiteLLM_EndUserTable(LiteLLMBase):
|
|||
blocked: bool
|
||||
alias: Optional[str] = None
|
||||
spend: float = 0.0
|
||||
allowed_model_region: Optional[Literal["eu"]] = None
|
||||
default_model: Optional[str] = None
|
||||
litellm_budget_table: Optional[LiteLLM_BudgetTable] = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
|
|
|
@ -206,9 +206,9 @@ async def get_end_user_object(
|
|||
|
||||
if end_user_id is None:
|
||||
return None
|
||||
|
||||
_key = "end_user_id:{}".format(end_user_id)
|
||||
# check if in cache
|
||||
cached_user_obj = user_api_key_cache.async_get_cache(key=end_user_id)
|
||||
cached_user_obj = await user_api_key_cache.async_get_cache(key=_key)
|
||||
if cached_user_obj is not None:
|
||||
if isinstance(cached_user_obj, dict):
|
||||
return LiteLLM_EndUserTable(**cached_user_obj)
|
||||
|
@ -223,7 +223,14 @@ async def get_end_user_object(
|
|||
if response is None:
|
||||
raise Exception
|
||||
|
||||
return LiteLLM_EndUserTable(**response.dict())
|
||||
# save the end-user object to cache
|
||||
await user_api_key_cache.async_set_cache(
|
||||
key="end_user_id:{}".format(end_user_id), value=response
|
||||
)
|
||||
|
||||
_response = LiteLLM_EndUserTable(**response.dict())
|
||||
|
||||
return _response
|
||||
except Exception as e: # if end-user not in db
|
||||
return None
|
||||
|
||||
|
|
|
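The change above turns `get_end_user_object` into a proper read-through cache: the key is prefixed with `end_user_id:`, the async cache read is awaited, and a DB hit is written back into the cache. A minimal sketch of that pattern follows; `cache` and `fetch_end_user_from_db` are stand-ins passed in by the caller, not litellm APIs.

```
from typing import Optional


async def get_end_user(end_user_id: Optional[str], cache, fetch_end_user_from_db) -> Optional[dict]:
    if end_user_id is None:
        return None

    _key = "end_user_id:{}".format(end_user_id)

    # check the cache first
    cached = await cache.async_get_cache(key=_key)
    if cached is not None:
        return cached

    try:
        # fall back to the DB, then save the row to the cache for next time
        row = await fetch_end_user_from_db(end_user_id)
        if row is None:
            raise Exception("end user not found")
        await cache.async_set_cache(key=_key, value=row)
        return row
    except Exception:
        return None  # end-user not in DB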
@ -15,6 +15,9 @@ from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
|
|||
from litellm.proxy.utils import PrismaClient
|
||||
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||
from typing import Optional
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.backends import default_backend
|
||||
from cryptography.hazmat.primitives import serialization
|
||||
|
||||
|
||||
class JWTHandler:
|
||||
|
@ -142,8 +145,8 @@ class JWTHandler:
|
|||
public_key = keys[0]
|
||||
elif len(keys) > 1:
|
||||
for key in keys:
|
||||
if kid is not None and key["kid"] == kid:
|
||||
public_key = key
|
||||
if kid is not None and key == kid:
|
||||
public_key = keys[key]
|
||||
|
||||
if public_key is None:
|
||||
raise Exception(
|
||||
|
@ -153,6 +156,11 @@ class JWTHandler:
|
|||
return public_key
|
||||
|
||||
async def auth_jwt(self, token: str) -> dict:
|
||||
audience = os.getenv("JWT_AUDIENCE")
|
||||
decode_options = None
|
||||
if audience is None:
|
||||
decode_options = {"verify_aud": False}
|
||||
|
||||
from jwt.algorithms import RSAAlgorithm
|
||||
|
||||
header = jwt.get_unverified_header(token)
|
||||
|
@ -182,7 +190,33 @@ class JWTHandler:
|
|||
token,
|
||||
public_key_rsa, # type: ignore
|
||||
algorithms=["RS256"],
|
||||
options={"verify_aud": False},
|
||||
options=decode_options,
|
||||
audience=audience,
|
||||
)
|
||||
return payload
|
||||
|
||||
except jwt.ExpiredSignatureError:
|
||||
# the token is expired, do something to refresh it
|
||||
raise Exception("Token Expired")
|
||||
except Exception as e:
|
||||
raise Exception(f"Validation fails: {str(e)}")
|
||||
elif public_key is not None and isinstance(public_key, str):
|
||||
try:
|
||||
cert = x509.load_pem_x509_certificate(public_key.encode(), default_backend())
|
||||
|
||||
# Extract public key
|
||||
key = cert.public_key().public_bytes(
|
||||
serialization.Encoding.PEM,
|
||||
serialization.PublicFormat.SubjectPublicKeyInfo
|
||||
)
|
||||
|
||||
# decode the token using the public key
|
||||
payload = jwt.decode(
|
||||
token,
|
||||
key,
|
||||
algorithms=["RS256"],
|
||||
audience=audience,
|
||||
options=decode_options
|
||||
)
|
||||
return payload
|
||||
|
||||
|
|
|
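The new branch above handles a PEM x509 certificate as the JWT public key, and only verifies the audience when `JWT_AUDIENCE` is set. The sketch below isolates that flow with PyJWT plus `cryptography`; the token and certificate values are placeholders supplied by the caller.

```
import os
import jwt
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization


def decode_with_certificate(token: str, pem_certificate: str) -> dict:
    audience = os.getenv("JWT_AUDIENCE")
    decode_options = {"verify_aud": False} if audience is None else None

    # extract the RSA public key from the certificate
    cert = x509.load_pem_x509_certificate(pem_certificate.encode(), default_backend())
    public_key = cert.public_key().public_bytes(
        serialization.Encoding.PEM,
        serialization.PublicFormat.SubjectPublicKeyInfo,
    )

    # verify the token; audience is only enforced when JWT_AUDIENCE is configured
    return jwt.decode(
        token,
        public_key,
        algorithms=["RS256"],
        audience=audience,
        options=decode_options,
    )
```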
@ -252,7 +252,7 @@ def run_server(
|
|||
if model and "ollama" in model and api_base is None:
|
||||
run_ollama_serve()
|
||||
if test_async is True:
|
||||
import requests, concurrent, time
|
||||
import requests, concurrent, time # type: ignore
|
||||
|
||||
api_base = f"http://{host}:{port}"
|
||||
|
||||
|
@ -418,7 +418,7 @@ def run_server(
|
|||
read from there and save it to os.env['DATABASE_URL']
|
||||
"""
|
||||
try:
|
||||
import yaml, asyncio
|
||||
import yaml, asyncio # type: ignore
|
||||
except:
|
||||
raise ImportError(
|
||||
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
|
||||
|
|
File diff suppressed because it is too large
|
@ -150,6 +150,8 @@ model LiteLLM_EndUserTable {
|
|||
user_id String @id
|
||||
alias String? // admin-facing alias
|
||||
spend Float @default(0.0)
|
||||
allowed_model_region String? // require all user requests to use models in this specific region
|
||||
default_model String? // use along with 'allowed_model_region'. if no available model in region, default to this model.
|
||||
budget_id String?
|
||||
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||
blocked Boolean @default(false)
|
||||
|
|
|
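With the new `allowed_model_region` and `default_model` columns, an end-user row can pin requests to a region. The sketch below assumes the generated prisma-client-py client (model attributes are lowercased, as in `self.db = Prisma()` used elsewhere in this diff); the user id, alias, and model name are placeholders.

```
import asyncio
from prisma import Prisma


async def main():
    db = Prisma()
    await db.connect()
    await db.litellm_endusertable.create(
        data={
            "user_id": "end-user-1",
            "alias": "Acme frontend",
            "allowed_model_region": "eu",        # require EU-hosted deployments
            "default_model": "azure/gpt-35-eu",  # placeholder fallback when no EU deployment matches
            "blocked": False,
        }
    )
    await db.disconnect()


asyncio.run(main())
```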
@ -73,6 +73,7 @@ class ProxyLogging:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
] = [
|
||||
"llm_exceptions",
|
||||
|
@ -80,11 +81,13 @@ class ProxyLogging:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
self.slack_alerting_instance = SlackAlerting(
|
||||
alerting_threshold=self.alerting_threshold,
|
||||
alerting=self.alerting,
|
||||
alert_types=self.alert_types,
|
||||
internal_usage_cache=self.internal_usage_cache,
|
||||
)
|
||||
|
||||
def update_values(
|
||||
|
@ -100,9 +103,11 @@ class ProxyLogging:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
]
|
||||
] = None,
|
||||
alerting_args: Optional[dict] = None,
|
||||
):
|
||||
self.alerting = alerting
|
||||
if alerting_threshold is not None:
|
||||
|
@ -114,8 +119,12 @@ class ProxyLogging:
|
|||
alerting=self.alerting,
|
||||
alerting_threshold=self.alerting_threshold,
|
||||
alert_types=self.alert_types,
|
||||
alerting_args=alerting_args,
|
||||
)
|
||||
|
||||
if "daily_reports" in self.alert_types:
|
||||
litellm.callbacks.append(self.slack_alerting_instance) # type: ignore
|
||||
|
||||
if redis_cache is not None:
|
||||
self.internal_usage_cache.redis_cache = redis_cache
|
||||
|
||||
|
@ -293,6 +302,7 @@ class ProxyLogging:
|
|||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
],
|
||||
request_data: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
|
||||
|
@ -322,10 +332,19 @@ class ProxyLogging:
|
|||
if _proxy_base_url is not None:
|
||||
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
||||
|
||||
extra_kwargs = {}
|
||||
if request_data is not None:
|
||||
_url = self.slack_alerting_instance._add_langfuse_trace_id_to_alert(
|
||||
request_data=request_data
|
||||
)
|
||||
if _url is not None:
|
||||
extra_kwargs["🪢 Langfuse Trace"] = _url
|
||||
formatted_message += "\n\n🪢 Langfuse Trace: {}".format(_url)
|
||||
|
||||
for client in self.alerting:
|
||||
if client == "slack":
|
||||
await self.slack_alerting_instance.send_alert(
|
||||
message=message, level=level, alert_type=alert_type
|
||||
message=message, level=level, alert_type=alert_type, **extra_kwargs
|
||||
)
|
||||
elif client == "sentry":
|
||||
if litellm.utils.sentry_sdk_instance is not None:
|
||||
|
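As a rough sketch of the enrichment added above: when the failing request carries Langfuse data, the trace URL is attached both as a structured field and inline in the alert message. This assumes litellm's `SlackAlerting` instance and the `_add_langfuse_trace_id_to_alert` helper referenced in this diff; the level and alert type are example values.

```
from litellm.integrations.slack_alerting import SlackAlerting


async def alert_with_trace(slack: SlackAlerting, message: str, request_data: dict):
    extra_kwargs = {}
    _url = slack._add_langfuse_trace_id_to_alert(request_data=request_data)
    if _url is not None:
        extra_kwargs["🪢 Langfuse Trace"] = _url
        message += "\n\n🪢 Langfuse Trace: {}".format(_url)
    await slack.send_alert(
        message=message, level="High", alert_type="llm_exceptions", **extra_kwargs
    )
```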
@ -360,6 +379,7 @@ class ProxyLogging:
|
|||
message=f"DB read/write call failed: {error_message}",
|
||||
level="High",
|
||||
alert_type="db_exceptions",
|
||||
request_data={},
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -375,7 +395,10 @@ class ProxyLogging:
|
|||
litellm.utils.capture_exception(error=original_exception)
|
||||
|
||||
async def post_call_failure_hook(
|
||||
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
|
||||
self,
|
||||
original_exception: Exception,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
request_data: dict,
|
||||
):
|
||||
"""
|
||||
Allows users to raise custom exceptions/log when a call fails, without having to deal with parsing Request body.
|
||||
|
@ -400,6 +423,7 @@ class ProxyLogging:
|
|||
message=f"LLM API call failed: {str(original_exception)}",
|
||||
level="High",
|
||||
alert_type="llm_exceptions",
|
||||
request_data=request_data,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -502,7 +526,7 @@ class PrismaClient:
|
|||
finally:
|
||||
os.chdir(original_dir)
|
||||
# Now you can import the Prisma Client
|
||||
from prisma import Prisma # type: ignore
|
||||
from prisma import Prisma
|
||||
|
||||
self.db = Prisma() # Client to connect to Prisma db
|
||||
|
||||
|
@ -1665,12 +1689,12 @@ def get_instance_fn(value: str, config_file_path: Optional[str] = None) -> Any:
|
|||
module_file_path = os.path.join(directory, *module_name.split("."))
|
||||
module_file_path += ".py"
|
||||
|
||||
spec = importlib.util.spec_from_file_location(module_name, module_file_path)
|
||||
spec = importlib.util.spec_from_file_location(module_name, module_file_path) # type: ignore
|
||||
if spec is None:
|
||||
raise ImportError(
|
||||
f"Could not find a module specification for {module_file_path}"
|
||||
)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
module = importlib.util.module_from_spec(spec) # type: ignore
|
||||
spec.loader.exec_module(module) # type: ignore
|
||||
else:
|
||||
# Dynamically import the module
|
||||
|
|
|
@ -21,6 +21,7 @@ from collections import defaultdict
|
|||
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
|
||||
from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler
|
||||
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
|
||||
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
|
||||
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2
|
||||
from litellm.llms.custom_httpx.azure_dall_e_2 import (
|
||||
CustomHTTPTransport,
|
||||
|
@ -31,6 +32,7 @@ from litellm.utils import (
|
|||
CustomStreamWrapper,
|
||||
get_utc_datetime,
|
||||
calculate_max_parallel_requests,
|
||||
_is_region_eu,
|
||||
)
|
||||
import copy
|
||||
from litellm._logging import verbose_router_logger
|
||||
|
@ -43,6 +45,7 @@ from litellm.types.router import (
|
|||
updateDeployment,
|
||||
updateLiteLLMParams,
|
||||
RetryPolicy,
|
||||
AlertingConfig,
|
||||
)
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
@ -98,9 +101,11 @@ class Router:
|
|||
"least-busy",
|
||||
"usage-based-routing",
|
||||
"latency-based-routing",
|
||||
"cost-based-routing",
|
||||
] = "simple-shuffle",
|
||||
routing_strategy_args: dict = {}, # just for latency-based routing
|
||||
semaphore: Optional[asyncio.Semaphore] = None,
|
||||
alerting_config: Optional[AlertingConfig] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
|
||||
|
@ -127,9 +132,9 @@ class Router:
|
|||
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
|
||||
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
|
||||
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
|
||||
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
|
||||
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing", "cost-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
|
||||
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
|
||||
|
||||
alerting_config (AlertingConfig): Slack alerting configuration. Defaults to None.
|
||||
Returns:
|
||||
Router: An instance of the litellm.Router class.
|
||||
|
||||
|
@ -314,6 +319,9 @@ class Router:
|
|||
self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = (
|
||||
model_group_retry_policy
|
||||
)
|
||||
self.alerting_config: Optional[AlertingConfig] = alerting_config
|
||||
if self.alerting_config is not None:
|
||||
self._initialize_alerting()
|
||||
|
||||
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
|
||||
if routing_strategy == "least-busy":
|
||||
|
@ -347,6 +355,14 @@ class Router:
|
|||
)
|
||||
if isinstance(litellm.callbacks, list):
|
||||
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
|
||||
elif routing_strategy == "cost-based-routing":
|
||||
self.lowestcost_logger = LowestCostLoggingHandler(
|
||||
router_cache=self.cache,
|
||||
model_list=self.model_list,
|
||||
routing_args={},
|
||||
)
|
||||
if isinstance(litellm.callbacks, list):
|
||||
litellm.callbacks.append(self.lowestcost_logger) # type: ignore
|
||||
|
||||
def print_deployment(self, deployment: dict):
|
||||
"""
|
||||
|
@ -1847,6 +1863,10 @@ class Router:
|
|||
self.cache.set_cache(
|
||||
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||
)
|
||||
|
||||
self.send_deployment_cooldown_alert(
|
||||
deployment_id=deployment, exception_status=exception_status
|
||||
)
|
||||
else:
|
||||
self.failed_calls.set_cache(
|
||||
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||
|
@ -1980,7 +2000,11 @@ class Router:
|
|||
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
|
||||
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
|
||||
api_key = litellm_params.get("api_key") or default_api_key
|
||||
if api_key and api_key.startswith("os.environ/"):
|
||||
if (
|
||||
api_key
|
||||
and isinstance(api_key, str)
|
||||
and api_key.startswith("os.environ/")
|
||||
):
|
||||
api_key_env_name = api_key.replace("os.environ/", "")
|
||||
api_key = litellm.get_secret(api_key_env_name)
|
||||
litellm_params["api_key"] = api_key
|
||||
|
@ -2004,6 +2028,7 @@ class Router:
|
|||
if (
|
||||
is_azure_ai_studio_model == True
|
||||
and api_base is not None
|
||||
and isinstance(api_base, str)
|
||||
and not api_base.endswith("/v1/")
|
||||
):
|
||||
# check if it ends with a trailing slash
|
||||
|
@ -2084,13 +2109,14 @@ class Router:
|
|||
organization = litellm.get_secret(organization_env_name)
|
||||
litellm_params["organization"] = organization
|
||||
|
||||
if "azure" in model_name:
|
||||
if api_base is None:
|
||||
if "azure" in model_name and isinstance(api_key, str):
|
||||
if api_base is None or not isinstance(api_base, str):
|
||||
raise ValueError(
|
||||
f"api_base is required for Azure OpenAI. Set it on your config. Model - {model}"
|
||||
)
|
||||
if api_version is None:
|
||||
api_version = "2023-07-01-preview"
|
||||
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
if not api_base.endswith("/"):
|
||||
api_base += "/"
|
||||
|
@ -2513,7 +2539,7 @@ class Router:
|
|||
self.default_deployment = deployment.to_json(exclude_none=True)
|
||||
|
||||
# Azure GPT-Vision Enhancements, users can pass os.environ/
|
||||
data_sources = deployment.litellm_params.get("dataSources", [])
|
||||
data_sources = deployment.litellm_params.get("dataSources", []) or []
|
||||
|
||||
for data_source in data_sources:
|
||||
params = data_source.get("parameters", {})
|
||||
|
@ -2530,6 +2556,22 @@ class Router:
|
|||
# init OpenAI, Azure clients
|
||||
self.set_client(model=deployment.to_json(exclude_none=True))
|
||||
|
||||
# set region (if azure model)
|
||||
try:
|
||||
if "azure" in deployment.litellm_params.model:
|
||||
region = litellm.utils.get_model_region(
|
||||
litellm_params=deployment.litellm_params, mode=None
|
||||
)
|
||||
|
||||
deployment.litellm_params.region_name = region
|
||||
except Exception as e:
|
||||
verbose_router_logger.error(
|
||||
"Unable to get the region for azure model - {}, {}".format(
|
||||
deployment.litellm_params.model, str(e)
|
||||
)
|
||||
)
|
||||
pass # [NON-BLOCKING]
|
||||
|
||||
return deployment
|
||||
|
||||
def add_deployment(self, deployment: Deployment) -> Optional[Deployment]:
|
||||
|
@ -2557,6 +2599,38 @@ class Router:
|
|||
self.model_names.append(deployment.model_name)
|
||||
return deployment
|
||||
|
||||
def upsert_deployment(self, deployment: Deployment) -> Deployment:
|
||||
"""
|
||||
Add or update deployment
|
||||
Parameters:
|
||||
- deployment: Deployment - the deployment to be added to the Router
|
||||
|
||||
Returns:
|
||||
- The added/updated deployment
|
||||
"""
|
||||
# check if deployment already exists
|
||||
|
||||
if deployment.model_info.id in self.get_model_ids():
|
||||
# remove the previous deployment
|
||||
removal_idx: Optional[int] = None
|
||||
for idx, model in enumerate(self.model_list):
|
||||
if model["model_info"]["id"] == deployment.model_info.id:
|
||||
removal_idx = idx
|
||||
|
||||
if removal_idx is not None:
|
||||
self.model_list.pop(removal_idx)
|
||||
|
||||
# add to model list
|
||||
_deployment = deployment.to_json(exclude_none=True)
|
||||
self.model_list.append(_deployment)
|
||||
|
||||
# initialize client
|
||||
self._add_deployment(deployment=deployment)
|
||||
|
||||
# add to model names
|
||||
self.model_names.append(deployment.model_name)
|
||||
return deployment
|
||||
|
||||
def delete_deployment(self, id: str) -> Optional[Deployment]:
|
||||
"""
|
||||
Parameters:
|
||||
|
@ -2580,11 +2654,21 @@ class Router:
|
|||
except:
|
||||
return None
|
||||
|
||||
def get_deployment(self, model_id: str):
|
||||
def get_deployment(self, model_id: str) -> Optional[Deployment]:
|
||||
"""
|
||||
Returns -> Deployment or None
|
||||
|
||||
Raise Exception -> if model found in invalid format
|
||||
"""
|
||||
for model in self.model_list:
|
||||
if "model_info" in model and "id" in model["model_info"]:
|
||||
if model_id == model["model_info"]["id"]:
|
||||
return model
|
||||
if isinstance(model, dict):
|
||||
return Deployment(**model)
|
||||
elif isinstance(model, Deployment):
|
||||
return model
|
||||
else:
|
||||
raise Exception("Model invalid format - {}".format(type(model)))
|
||||
return None
|
||||
|
||||
def get_model_info(self, id: str) -> Optional[dict]:
|
||||
|
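A hedged usage sketch for the new `upsert_deployment` helper and the now-typed `get_deployment` above. `Deployment` comes from `litellm.types.router` as imported in this diff; `LiteLLM_Params` as the params model, plus the keys and model names, are assumptions for illustration.

```
from litellm import Router
from litellm.types.router import Deployment, LiteLLM_Params

router = Router(model_list=[])

dep = Deployment(
    model_name="fake-openai-endpoint",
    litellm_params=LiteLLM_Params(model="openai/my-fake-model", api_key="my-fake-key"),
)

router.upsert_deployment(dep)   # first call adds the deployment
router.upsert_deployment(dep)   # same model_info.id -> old entry removed, new one re-added

print(router.get_model_ids())                                      # list of deployment ids
print(router.get_deployment(model_id=router.get_model_ids()[0]))   # Deployment or None
```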
@ -2597,7 +2681,10 @@ class Router:
|
|||
return model
|
||||
return None
|
||||
|
||||
def get_model_ids(self):
|
||||
def get_model_ids(self) -> List[str]:
|
||||
"""
|
||||
Returns list of model id's.
|
||||
"""
|
||||
ids = []
|
||||
for model in self.model_list:
|
||||
if "model_info" in model and "id" in model["model_info"]:
|
||||
|
@ -2605,7 +2692,7 @@ class Router:
|
|||
ids.append(id)
|
||||
return ids
|
||||
|
||||
def get_model_names(self):
|
||||
def get_model_names(self) -> List[str]:
|
||||
return self.model_names
|
||||
|
||||
def get_model_list(self):
|
||||
|
@ -2631,6 +2718,7 @@ class Router:
|
|||
"retry_after",
|
||||
"fallbacks",
|
||||
"context_window_fallbacks",
|
||||
"model_group_retry_policy",
|
||||
]
|
||||
|
||||
for var in vars_to_include:
|
||||
|
@ -2656,6 +2744,7 @@ class Router:
|
|||
"retry_after",
|
||||
"fallbacks",
|
||||
"context_window_fallbacks",
|
||||
"model_group_retry_policy",
|
||||
]
|
||||
|
||||
_int_settings = [
|
||||
|
@ -2754,14 +2843,17 @@ class Router:
|
|||
model: str,
|
||||
healthy_deployments: List,
|
||||
messages: List[Dict[str, str]],
|
||||
allowed_model_region: Optional[Literal["eu"]] = None,
|
||||
):
|
||||
"""
|
||||
Filter out model in model group, if:
|
||||
|
||||
- model context window < message length
|
||||
- filter models above rpm limits
|
||||
- if region given, filter out models not in that region / unknown region
|
||||
- [TODO] function call and model doesn't support function calling
|
||||
"""
|
||||
|
||||
verbose_router_logger.debug(
|
||||
f"Starting Pre-call checks for deployments in model={model}"
|
||||
)
|
||||
|
@ -2812,9 +2904,9 @@ class Router:
|
|||
except Exception as e:
|
||||
verbose_router_logger.debug("An error occurs - {}".format(str(e)))
|
||||
|
||||
## RPM CHECK ##
|
||||
_litellm_params = deployment.get("litellm_params", {})
|
||||
model_id = deployment.get("model_info", {}).get("id", "")
|
||||
## RPM CHECK ##
|
||||
### get local router cache ###
|
||||
current_request_cache_local = (
|
||||
self.cache.get_cache(key=model_id, local_only=True) or 0
|
||||
|
@ -2842,6 +2934,28 @@ class Router:
|
|||
_rate_limit_error = True
|
||||
continue
|
||||
|
||||
## REGION CHECK ##
|
||||
if allowed_model_region is not None:
|
||||
if _litellm_params.get("region_name") is not None and isinstance(
|
||||
_litellm_params["region_name"], str
|
||||
):
|
||||
# check if in allowed_model_region
|
||||
if (
|
||||
_is_region_eu(model_region=_litellm_params["region_name"])
|
||||
== False
|
||||
):
|
||||
invalid_model_indices.append(idx)
|
||||
continue
|
||||
else:
|
||||
verbose_router_logger.debug(
|
||||
"Filtering out model - {}, as model_region=None, and allowed_model_region={}".format(
|
||||
model_id, allowed_model_region
|
||||
)
|
||||
)
|
||||
# filter out since region unknown, and user wants to filter for specific region
|
||||
invalid_model_indices.append(idx)
|
||||
continue
|
||||
|
||||
if len(invalid_model_indices) == len(_returned_deployments):
|
||||
"""
|
||||
- no healthy deployments available b/c context window checks or rate limit error
|
||||
|
@ -2943,6 +3057,7 @@ class Router:
|
|||
if (
|
||||
self.routing_strategy != "usage-based-routing-v2"
|
||||
and self.routing_strategy != "simple-shuffle"
|
||||
and self.routing_strategy != "cost-based-routing"
|
||||
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
|
||||
return self.get_available_deployment(
|
||||
model=model,
|
||||
|
@ -2980,10 +3095,31 @@ class Router:
|
|||
|
||||
# filter pre-call checks
|
||||
if self.enable_pre_call_checks and messages is not None:
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model, healthy_deployments=healthy_deployments, messages=messages
|
||||
_allowed_model_region = (
|
||||
request_kwargs.get("allowed_model_region")
|
||||
if request_kwargs is not None
|
||||
else None
|
||||
)
|
||||
|
||||
if _allowed_model_region == "eu":
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
allowed_model_region=_allowed_model_region,
|
||||
)
|
||||
else:
|
||||
verbose_router_logger.debug(
|
||||
"Ignoring given 'allowed_model_region'={}. Only 'eu' is allowed".format(
|
||||
_allowed_model_region
|
||||
)
|
||||
)
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
if len(healthy_deployments) == 0:
|
||||
raise ValueError(
|
||||
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
|
||||
|
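The region check added to `_pre_call_checks` drops deployments whose `region_name` is missing or not in the EU when the caller asks for `allowed_model_region="eu"`. The following is a simplified, self-contained mirror of that filter; the EU prefix list is an assumption for illustration, not litellm's `_is_region_eu` table.

```
from typing import Dict, List, Literal, Optional

EU_REGION_PREFIXES = ("eu-", "europe", "france", "sweden", "switzerland")  # assumption, not litellm's list


def filter_by_region(
    deployments: List[Dict],
    allowed_model_region: Optional[Literal["eu"]] = None,
) -> List[Dict]:
    """Keep deployments whose region_name looks like an EU region; drop unknown regions."""
    if allowed_model_region != "eu":
        return deployments

    kept = []
    for d in deployments:
        region = d.get("litellm_params", {}).get("region_name")
        if isinstance(region, str) and region.lower().startswith(EU_REGION_PREFIXES):
            kept.append(d)
        # no region set -> filtered out, since the caller requires a specific region
    return kept
```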
@ -2999,6 +3135,16 @@ class Router:
|
|||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
if (
|
||||
self.routing_strategy == "cost-based-routing"
|
||||
and self.lowestcost_logger is not None
|
||||
):
|
||||
deployment = await self.lowestcost_logger.async_get_available_deployments(
|
||||
model_group=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
elif self.routing_strategy == "simple-shuffle":
|
||||
# if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
|
||||
############## Check if we can do a RPM/TPM based weighted pick #################
|
||||
|
@ -3266,6 +3412,8 @@ class Router:
|
|||
|
||||
if retry_policy is None:
|
||||
return None
|
||||
if isinstance(retry_policy, dict):
|
||||
retry_policy = RetryPolicy(**retry_policy)
|
||||
if (
|
||||
isinstance(exception, litellm.BadRequestError)
|
||||
and retry_policy.BadRequestErrorRetries is not None
|
||||
|
@ -3292,6 +3440,56 @@ class Router:
|
|||
):
|
||||
return retry_policy.ContentPolicyViolationErrorRetries
|
||||
|
||||
def _initialize_alerting(self):
|
||||
from litellm.integrations.slack_alerting import SlackAlerting
|
||||
|
||||
router_alerting_config: AlertingConfig = self.alerting_config
|
||||
|
||||
_slack_alerting_logger = SlackAlerting(
|
||||
alerting_threshold=router_alerting_config.alerting_threshold,
|
||||
alerting=["slack"],
|
||||
default_webhook_url=router_alerting_config.webhook_url,
|
||||
)
|
||||
|
||||
litellm.callbacks.append(_slack_alerting_logger)
|
||||
litellm.success_callback.append(
|
||||
_slack_alerting_logger.response_taking_too_long_callback
|
||||
)
|
||||
print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa
|
||||
|
||||
def send_deployment_cooldown_alert(
|
||||
self, deployment_id: str, exception_status: Union[str, int]
|
||||
):
|
||||
try:
|
||||
from litellm.proxy.proxy_server import proxy_logging_obj
|
||||
|
||||
# trigger slack alert saying deployment is in cooldown
|
||||
if (
|
||||
proxy_logging_obj is not None
|
||||
and proxy_logging_obj.alerting is not None
|
||||
and "slack" in proxy_logging_obj.alerting
|
||||
):
|
||||
_deployment = self.get_deployment(model_id=deployment_id)
|
||||
if _deployment is None:
|
||||
return
|
||||
|
||||
_litellm_params = _deployment["litellm_params"]
|
||||
temp_litellm_params = copy.deepcopy(_litellm_params)
|
||||
temp_litellm_params = dict(temp_litellm_params)
|
||||
_model_name = _deployment.get("model_name", None)
|
||||
_api_base = litellm.get_api_base(
|
||||
model=_model_name, optional_params=temp_litellm_params
|
||||
)
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.slack_alerting_instance.send_alert(
|
||||
message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
|
||||
alert_type="cooldown_deployment",
|
||||
level="Low",
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
def flush_cache(self):
|
||||
litellm.cache = None
|
||||
self.cache.flush_cache()
|
||||
|
|
|
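Tying the new pieces together: passing `alerting_config` makes the Router build its own `SlackAlerting` logger via `_initialize_alerting` above. A small sketch follows; the `AlertingConfig` field names come from this diff, while the webhook URL, threshold value, and model entry are placeholders.

```
from litellm import Router
from litellm.types.router import AlertingConfig

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {"model": "openai/my-fake-model", "api_key": "my-fake-key"},
        }
    ],
    alerting_config=AlertingConfig(
        webhook_url="https://hooks.slack.com/services/...",  # placeholder Slack webhook
        alerting_threshold=300,                              # example threshold, in seconds
    ),
)
```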
@ -6,7 +6,7 @@
|
|||
# - use litellm.success + failure callbacks to log when a request completed
|
||||
# - in get_available_deployment, for a given model group name -> pick based on traffic
|
||||
|
||||
import dotenv, os, requests, random
|
||||
import dotenv, os, requests, random # type: ignore
|
||||
from typing import Optional
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
|
|
350	litellm/router_strategy/lowest_cost.py (new file)
|
@ -0,0 +1,350 @@
|
|||
#### What this does ####
|
||||
# picks the deployment with the lowest cost (input + output cost per token, from litellm_params overrides or litellm.model_cost)
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
import dotenv, os, requests, random # type: ignore
|
||||
from typing import Optional, Union, List, Dict
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
from litellm.caching import DualCache
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm._logging import verbose_router_logger
|
||||
from litellm import ModelResponse
|
||||
from litellm import token_counter
|
||||
import litellm
|
||||
|
||||
|
||||
class LiteLLMBase(BaseModel):
|
||||
"""
|
||||
Implements default functions, all pydantic objects should have.
|
||||
"""
|
||||
|
||||
def json(self, **kwargs):
|
||||
try:
|
||||
return self.model_dump() # noqa
|
||||
except:
|
||||
# if using pydantic v1
|
||||
return self.dict()
|
||||
|
||||
|
||||
class LowestCostLoggingHandler(CustomLogger):
|
||||
test_flag: bool = False
|
||||
logged_success: int = 0
|
||||
logged_failure: int = 0
|
||||
|
||||
def __init__(
|
||||
self, router_cache: DualCache, model_list: list, routing_args: dict = {}
|
||||
):
|
||||
self.router_cache = router_cache
|
||||
self.model_list = model_list
|
||||
|
||||
async def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
"""
|
||||
Update usage on success
|
||||
"""
|
||||
if kwargs["litellm_params"].get("metadata") is None:
|
||||
pass
|
||||
else:
|
||||
model_group = kwargs["litellm_params"]["metadata"].get(
|
||||
"model_group", None
|
||||
)
|
||||
|
||||
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
|
||||
if model_group is None or id is None:
|
||||
return
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
"""
|
||||
{
|
||||
{model_group}_map: {
|
||||
id: {
|
||||
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
cost_key = f"{model_group}_map"
|
||||
|
||||
response_ms: timedelta = end_time - start_time
|
||||
|
||||
final_value = response_ms
|
||||
total_tokens = 0
|
||||
|
||||
if isinstance(response_obj, ModelResponse):
|
||||
completion_tokens = response_obj.usage.completion_tokens
|
||||
total_tokens = response_obj.usage.total_tokens
|
||||
final_value = float(response_ms.total_seconds() / completion_tokens)
|
||||
|
||||
# ------------
|
||||
# Update usage
|
||||
# ------------
|
||||
|
||||
request_count_dict = (
|
||||
await self.router_cache.async_get_cache(key=cost_key) or {}
|
||||
)
|
||||
|
||||
# check local result first
|
||||
|
||||
if id not in request_count_dict:
|
||||
request_count_dict[id] = {}
|
||||
|
||||
if precise_minute not in request_count_dict[id]:
|
||||
request_count_dict[id][precise_minute] = {}
|
||||
|
||||
if precise_minute not in request_count_dict[id]:
|
||||
request_count_dict[id][precise_minute] = {}
|
||||
|
||||
## TPM
|
||||
request_count_dict[id][precise_minute]["tpm"] = (
|
||||
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
|
||||
)
|
||||
|
||||
## RPM
|
||||
request_count_dict[id][precise_minute]["rpm"] = (
|
||||
request_count_dict[id][precise_minute].get("rpm", 0) + 1
|
||||
)
|
||||
|
||||
await self.router_cache.async_set_cache(
|
||||
key=cost_key, value=request_count_dict
|
||||
)
|
||||
|
||||
### TESTING ###
|
||||
if self.test_flag:
|
||||
self.logged_success += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
"""
|
||||
Update cost usage on success
|
||||
"""
|
||||
if kwargs["litellm_params"].get("metadata") is None:
|
||||
pass
|
||||
else:
|
||||
model_group = kwargs["litellm_params"]["metadata"].get(
|
||||
"model_group", None
|
||||
)
|
||||
|
||||
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
|
||||
if model_group is None or id is None:
|
||||
return
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
"""
|
||||
{
|
||||
{model_group}_map: {
|
||||
id: {
|
||||
"cost": [..]
|
||||
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
cost_key = f"{model_group}_map"
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
|
||||
response_ms: timedelta = end_time - start_time
|
||||
|
||||
final_value = response_ms
|
||||
total_tokens = 0
|
||||
|
||||
if isinstance(response_obj, ModelResponse):
|
||||
completion_tokens = response_obj.usage.completion_tokens
|
||||
total_tokens = response_obj.usage.total_tokens
|
||||
final_value = float(response_ms.total_seconds() / completion_tokens)
|
||||
|
||||
# ------------
|
||||
# Update usage
|
||||
# ------------
|
||||
|
||||
request_count_dict = (
|
||||
await self.router_cache.async_get_cache(key=cost_key) or {}
|
||||
)
|
||||
|
||||
if id not in request_count_dict:
|
||||
request_count_dict[id] = {}
|
||||
if precise_minute not in request_count_dict[id]:
|
||||
request_count_dict[id][precise_minute] = {}
|
||||
|
||||
## TPM
|
||||
request_count_dict[id][precise_minute]["tpm"] = (
|
||||
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
|
||||
)
|
||||
|
||||
## RPM
|
||||
request_count_dict[id][precise_minute]["rpm"] = (
|
||||
request_count_dict[id][precise_minute].get("rpm", 0) + 1
|
||||
)
|
||||
|
||||
await self.router_cache.async_set_cache(
|
||||
key=cost_key, value=request_count_dict
|
||||
) # reset map within window
|
||||
|
||||
### TESTING ###
|
||||
if self.test_flag:
|
||||
self.logged_success += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
async def async_get_available_deployments(
|
||||
self,
|
||||
model_group: str,
|
||||
healthy_deployments: list,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
request_kwargs: Optional[Dict] = None,
|
||||
):
|
||||
"""
|
||||
Returns a deployment with the lowest cost
|
||||
"""
|
||||
cost_key = f"{model_group}_map"
|
||||
|
||||
request_count_dict = await self.router_cache.async_get_cache(key=cost_key) or {}
|
||||
|
||||
# -----------------------
|
||||
# Find lowest used model
|
||||
# ----------------------
|
||||
lowest_cost = float("inf")
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
|
||||
deployment = None
|
||||
|
||||
if request_count_dict is None: # base case
|
||||
return
|
||||
|
||||
all_deployments = request_count_dict
|
||||
for d in healthy_deployments:
|
||||
## if healthy deployment not yet used
|
||||
if d["model_info"]["id"] not in all_deployments:
|
||||
all_deployments[d["model_info"]["id"]] = {
|
||||
precise_minute: {"tpm": 0, "rpm": 0},
|
||||
}
|
||||
|
||||
try:
|
||||
input_tokens = token_counter(messages=messages, text=input)
|
||||
except:
|
||||
input_tokens = 0
|
||||
|
||||
# randomly sample from all_deployments, in case all deployments have latency=0.0
|
||||
_items = all_deployments.items()
|
||||
|
||||
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
|
||||
potential_deployments = []
|
||||
_cost_per_deployment = {}
|
||||
for item, item_map in all_deployments.items():
|
||||
## get the item from model list
|
||||
_deployment = None
|
||||
for m in healthy_deployments:
|
||||
if item == m["model_info"]["id"]:
|
||||
_deployment = m
|
||||
|
||||
if _deployment is None:
|
||||
continue # skip to next one
|
||||
|
||||
_deployment_tpm = (
|
||||
_deployment.get("tpm", None)
|
||||
or _deployment.get("litellm_params", {}).get("tpm", None)
|
||||
or _deployment.get("model_info", {}).get("tpm", None)
|
||||
or float("inf")
|
||||
)
|
||||
|
||||
_deployment_rpm = (
|
||||
_deployment.get("rpm", None)
|
||||
or _deployment.get("litellm_params", {}).get("rpm", None)
|
||||
or _deployment.get("model_info", {}).get("rpm", None)
|
||||
or float("inf")
|
||||
)
|
||||
item_litellm_model_name = _deployment.get("litellm_params", {}).get("model")
|
||||
item_litellm_model_cost_map = litellm.model_cost.get(
|
||||
item_litellm_model_name, {}
|
||||
)
|
||||
|
||||
# check if user provided input_cost_per_token and output_cost_per_token in litellm_params
|
||||
item_input_cost = None
|
||||
item_output_cost = None
|
||||
if _deployment.get("litellm_params", {}).get("input_cost_per_token", None):
|
||||
item_input_cost = _deployment.get("litellm_params", {}).get(
|
||||
"input_cost_per_token"
|
||||
)
|
||||
|
||||
if _deployment.get("litellm_params", {}).get("output_cost_per_token", None):
|
||||
item_output_cost = _deployment.get("litellm_params", {}).get(
|
||||
"output_cost_per_token"
|
||||
)
|
||||
|
||||
if item_input_cost is None:
|
||||
item_input_cost = item_litellm_model_cost_map.get(
|
||||
"input_cost_per_token", 5.0
|
||||
)
|
||||
|
||||
if item_output_cost is None:
|
||||
item_output_cost = item_litellm_model_cost_map.get(
|
||||
"output_cost_per_token", 5.0
|
||||
)
|
||||
|
||||
# if litellm["model"] is not in model_cost map -> use item_cost = $10
|
||||
|
||||
item_cost = item_input_cost + item_output_cost
|
||||
|
||||
item_rpm = item_map.get(precise_minute, {}).get("rpm", 0)
|
||||
item_tpm = item_map.get(precise_minute, {}).get("tpm", 0)
|
||||
|
||||
verbose_router_logger.debug(
|
||||
f"item_cost: {item_cost}, item_tpm: {item_tpm}, item_rpm: {item_rpm}, model_id: {_deployment.get('model_info', {}).get('id')}"
|
||||
)
|
||||
|
||||
# -------------- #
|
||||
# Debugging Logic
|
||||
# -------------- #
|
||||
# We use _cost_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
|
||||
# this helps a user to debug why the router picked a specific deployment #
|
||||
_deployment_api_base = _deployment.get("litellm_params", {}).get(
|
||||
"api_base", ""
|
||||
)
|
||||
if _deployment_api_base is not None:
|
||||
_cost_per_deployment[_deployment_api_base] = item_cost
|
||||
# -------------- #
|
||||
# End of Debugging Logic
|
||||
# -------------- #
|
||||
|
||||
if (
|
||||
item_tpm + input_tokens > _deployment_tpm
|
||||
or item_rpm + 1 > _deployment_rpm
|
||||
): # if user passed in tpm / rpm in the model_list
|
||||
continue
|
||||
else:
|
||||
potential_deployments.append((_deployment, item_cost))
|
||||
|
||||
if len(potential_deployments) == 0:
|
||||
return None
|
||||
|
||||
potential_deployments = sorted(potential_deployments, key=lambda x: x[1])
|
||||
|
||||
selected_deployment = potential_deployments[0][0]
|
||||
return selected_deployment
|
|
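The new handler reads per-deployment `input_cost_per_token` / `output_cost_per_token` overrides from `litellm_params`, falling back to `litellm.model_cost`, and picks the cheapest deployment that is still under its tpm/rpm limits. A sketch of wiring it up through the Router follows; model names and prices are illustrative.

```
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "input_cost_per_token": 0.000001,
                "output_cost_per_token": 0.000002,
            },
        },
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model-2",
                "api_key": "my-fake-key-2",
                "input_cost_per_token": 0.00001,   # 10x pricier, picked only if the cheap one hits its tpm/rpm limits
                "output_cost_per_token": 0.00002,
            },
        },
    ],
    routing_strategy="cost-based-routing",
)
```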
@ -1,7 +1,7 @@
|
|||
#### What this does ####
|
||||
# picks based on response time (for streaming, this is time to first token)
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
import dotenv, os, requests, random
|
||||
from pydantic import BaseModel, Extra, Field, root_validator # type: ignore
|
||||
import dotenv, os, requests, random # type: ignore
|
||||
from typing import Optional, Union, List, Dict
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,5 +1,6 @@
|
|||
import pytest
|
||||
from litellm import acompletion
|
||||
from litellm import completion
|
||||
|
||||
|
||||
def test_acompletion_params():
|
||||
|
@ -7,17 +8,29 @@ def test_acompletion_params():
|
|||
from litellm.types.completion import CompletionRequest
|
||||
|
||||
acompletion_params_odict = inspect.signature(acompletion).parameters
|
||||
acompletion_params = {name: param.annotation for name, param in acompletion_params_odict.items()}
|
||||
completion_params = {field_name: field_type for field_name, field_type in CompletionRequest.__annotations__.items()}
|
||||
completion_params_dict = inspect.signature(completion).parameters
|
||||
|
||||
# remove kwargs
|
||||
acompletion_params.pop("kwargs", None)
|
||||
acompletion_params = {
|
||||
name: param.annotation for name, param in acompletion_params_odict.items()
|
||||
}
|
||||
completion_params = {
|
||||
name: param.annotation for name, param in completion_params_dict.items()
|
||||
}
|
||||
|
||||
keys_acompletion = set(acompletion_params.keys())
|
||||
keys_completion = set(completion_params.keys())
|
||||
|
||||
print(keys_acompletion)
|
||||
print("\n\n\n")
|
||||
print(keys_completion)
|
||||
|
||||
print("diff=", keys_completion - keys_acompletion)
|
||||
|
||||
# Assert that the parameters are the same
|
||||
if keys_acompletion != keys_completion:
|
||||
pytest.fail("The parameters of the acompletion function and the CompletionRequest class are not the same.")
|
||||
pytest.fail(
|
||||
"The parameters of the litellm.acompletion function and litellm.completion are not the same."
|
||||
)
|
||||
|
||||
|
||||
# test_acompletion_params()
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import copy
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import io, asyncio
|
||||
import asyncio
|
||||
|
||||
import logging
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
|
@ -18,6 +20,21 @@ import time
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def langfuse_client():
|
||||
import langfuse
|
||||
|
||||
langfuse_client = langfuse.Langfuse(
|
||||
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
|
||||
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
|
||||
)
|
||||
|
||||
with patch(
|
||||
"langfuse.Langfuse", MagicMock(return_value=langfuse_client)
|
||||
) as mock_langfuse_client:
|
||||
yield mock_langfuse_client()
|
||||
|
||||
|
||||
def search_logs(log_file_path, num_good_logs=1):
|
||||
"""
|
||||
Searches the given log file for logs containing the "/api/public" string.
|
||||
|
@ -129,21 +146,10 @@ def test_langfuse_logging_async():
|
|||
pytest.fail(f"An exception occurred - {e}")
|
||||
|
||||
|
||||
async def make_async_calls():
|
||||
async def make_async_calls(metadata=None, **completion_kwargs):
|
||||
tasks = []
|
||||
for _ in range(5):
|
||||
task = asyncio.create_task(
|
||||
litellm.acompletion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=[{"role": "user", "content": "This is a test"}],
|
||||
max_tokens=5,
|
||||
temperature=0.7,
|
||||
timeout=5,
|
||||
user="langfuse_latency_test_user",
|
||||
mock_response="It's simple to use and easy to get started",
|
||||
)
|
||||
)
|
||||
tasks.append(task)
|
||||
tasks.append(create_async_task())
|
||||
|
||||
# Measure the start time before running the tasks
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
|
@ -161,9 +167,30 @@ async def make_async_calls():
|
|||
return total_time
|
||||
|
||||
|
||||
def create_async_task(**completion_kwargs):
|
||||
"""
|
||||
Creates an async task for the litellm.acompletion function.
|
||||
This is just the task, but it is not run here.
|
||||
To run the task it must be awaited or used in other asyncio coroutine execution functions like asyncio.gather.
|
||||
Any kwargs passed to this function will be passed to the litellm.acompletion function.
|
||||
By default a standard set of arguments are used for the litellm.acompletion function.
|
||||
"""
|
||||
completion_args = {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"messages": [{"role": "user", "content": "This is a test"}],
|
||||
"max_tokens": 5,
|
||||
"temperature": 0.7,
|
||||
"timeout": 5,
|
||||
"user": "langfuse_latency_test_user",
|
||||
"mock_response": "It's simple to use and easy to get started",
|
||||
}
|
||||
completion_args.update(completion_kwargs)
|
||||
return asyncio.create_task(litellm.acompletion(**completion_args))
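As a quick illustration (reusing the create_async_task helper and the asyncio import already in this test module), per-call overrides can be passed straight through to litellm.acompletion and the tasks gathered, which is what make_async_calls() now does; the trace ids here are placeholders.

```
async def run_mocked_batch():
    tasks = [
        create_async_task(metadata={"trace_id": f"litellm-test-trace-{i}"})
        for i in range(5)
    ]
    # run all mocked completions concurrently and return their responses
    return await asyncio.gather(*tasks)
```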
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("stream", [False, True])
|
||||
async def test_langfuse_logging_without_request_response(stream):
|
||||
async def test_langfuse_logging_without_request_response(stream, langfuse_client):
|
||||
try:
|
||||
import uuid
|
||||
|
||||
|
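The docstring above spells out the intended usage: create_async_task only builds the task, and the caller decides how to execute it. A minimal sketch of that pattern, mirroring the five-call batch in make_async_calls (run_batch is an illustrative wrapper, not part of the test file):

import asyncio


async def run_batch():
    # Build the tasks first (each wraps litellm.acompletion via create_async_task),
    # then run them concurrently and collect the responses.
    tasks = [create_async_task(metadata={"batch_index": i}) for i in range(5)]
    return await asyncio.gather(*tasks)
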
@@ -171,12 +198,8 @@ async def test_langfuse_logging_without_request_response(stream):
        litellm.set_verbose = True
        litellm.turn_off_message_logging = True
        litellm.success_callback = ["langfuse"]
        response = await litellm.acompletion(
        response = await create_async_task(
            model="gpt-3.5-turbo",
            mock_response="It's simple to use and easy to get started",
            messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
            max_tokens=10,
            temperature=0.2,
            stream=stream,
            metadata={"trace_id": _unique_trace_name},
        )

@@ -185,14 +208,8 @@ async def test_langfuse_logging_without_request_response(stream):
            async for chunk in response:
                print(chunk)

        await asyncio.sleep(3)

        import langfuse

        langfuse_client = langfuse.Langfuse(
            public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
            secret_key=os.environ["LANGFUSE_SECRET_KEY"],
        )
        langfuse_client.flush()
        await asyncio.sleep(2)

        # get trace with _unique_trace_name
        trace = langfuse_client.get_generations(trace_id=_unique_trace_name)

@@ -211,6 +228,123 @@ async def test_langfuse_logging_without_request_response(stream):
        pytest.fail(f"An exception occurred - {e}")


@pytest.mark.asyncio
async def test_langfuse_logging_metadata(langfuse_client):
    """
    Test that creates multiple traces, with a varying number of generations and sets various metadata fields
    Confirms that no metadata that is standard within Langfuse is duplicated in the respective trace or generation metadata
    For trace continuation certain metadata of the trace is overriden with metadata from the last generation based on the update_trace_keys field
    Version is set for both the trace and the generation
    Release is just set for the trace
    Tags is just set for the trace
    """
    import uuid

    litellm.set_verbose = True
    litellm.success_callback = ["langfuse"]

    trace_identifiers = {}
    expected_filtered_metadata_keys = {
        "trace_name",
        "trace_id",
        "existing_trace_id",
        "trace_user_id",
        "session_id",
        "tags",
        "generation_name",
        "generation_id",
        "prompt",
    }
    trace_metadata = {
        "trace_actual_metadata_key": "trace_actual_metadata_value"
    }  # Allows for setting the metadata on the trace
    run_id = str(uuid.uuid4())
    session_id = f"litellm-test-session-{run_id}"
    trace_common_metadata = {
        "session_id": session_id,
        "tags": ["litellm-test-tag1", "litellm-test-tag2"],
        "update_trace_keys": [
            "output",
            "trace_metadata",
        ],  # Overwrite the following fields in the trace with the last generation's output and the trace_user_id
        "trace_metadata": trace_metadata,
        "gen_metadata_key": "gen_metadata_value",  # Metadata key that should not be filtered in the generation
        "trace_release": "litellm-test-release",
        "version": "litellm-test-version",
    }
    for trace_num in range(1, 3):  # Two traces
        metadata = copy.deepcopy(trace_common_metadata)
        trace_id = f"litellm-test-trace{trace_num}-{run_id}"
        metadata["trace_id"] = trace_id
        metadata["trace_name"] = trace_id
        trace_identifiers[trace_id] = []
        print(f"Trace: {trace_id}")
        for generation_num in range(
            1, trace_num + 1
        ):  # Each trace has a number of generations equal to its trace number
            metadata["trace_user_id"] = f"litellm-test-user{generation_num}-{run_id}"
            generation_id = (
                f"litellm-test-trace{trace_num}-generation-{generation_num}-{run_id}"
            )
            metadata["generation_id"] = generation_id
            metadata["generation_name"] = generation_id
            metadata["trace_metadata"][
                "generation_id"
            ] = generation_id  # Update to test if trace_metadata is overwritten by update trace keys
            trace_identifiers[trace_id].append(generation_id)
            print(f"Generation: {generation_id}")
            response = await create_async_task(
                model="gpt-3.5-turbo",
                mock_response=f"{session_id}:{trace_id}:{generation_id}",
                messages=[
                    {
                        "role": "user",
                        "content": f"{session_id}:{trace_id}:{generation_id}",
                    }
                ],
                max_tokens=100,
                temperature=0.2,
                metadata=copy.deepcopy(
                    metadata
                ),  # Every generation needs its own metadata, langfuse is not async/thread safe without it
            )
            print(response)
            metadata["existing_trace_id"] = trace_id

    langfuse_client.flush()
    await asyncio.sleep(2)

    # Tests the metadata filtering and the override of the output to be the last generation
    for trace_id, generation_ids in trace_identifiers.items():
        trace = langfuse_client.get_trace(id=trace_id)
        assert trace.id == trace_id
        assert trace.session_id == session_id
        assert trace.metadata != trace_metadata
        generations = list(
            reversed(langfuse_client.get_generations(trace_id=trace_id).data)
        )
        assert len(generations) == len(generation_ids)
        assert (
            trace.input == generations[0].input
        )  # Should be set by the first generation
        assert (
            trace.output == generations[-1].output
        )  # Should be overwritten by the last generation according to update_trace_keys
        assert (
            trace.metadata != generations[-1].metadata
        )  # Should be overwritten by the last generation according to update_trace_keys
        assert trace.metadata["generation_id"] == generations[-1].id
        assert set(trace.tags).issuperset(trace_common_metadata["tags"])
        print("trace_from_langfuse", trace)
        for generation_id, generation in zip(generation_ids, generations):
            assert generation.id == generation_id
            assert generation.trace_id == trace_id
            assert set(generation.metadata.keys()).isdisjoint(
                expected_filtered_metadata_keys
            )
            print("generation_from_langfuse", generation)


@pytest.mark.skip(reason="beta test - checking langfuse output")
def test_langfuse_logging():
    try:

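The assertions above depend on the Langfuse callback stripping its own control keys out of the metadata it forwards. A minimal sketch of that filtering idea, assuming it amounts to a dict split over a reserved-key set (split_metadata is illustrative, not the callback's real function; the key set mirrors expected_filtered_metadata_keys):

RESERVED_KEYS = {
    "trace_name",
    "trace_id",
    "existing_trace_id",
    "trace_user_id",
    "session_id",
    "tags",
    "generation_name",
    "generation_id",
    "prompt",
}


def split_metadata(metadata: dict):
    # Control keys steer the Langfuse trace/generation; everything else is
    # passed through as user metadata (e.g. "gen_metadata_key" in the test).
    controls = {k: v for k, v in metadata.items() if k in RESERVED_KEYS}
    passthrough = {k: v for k, v in metadata.items() if k not in RESERVED_KEYS}
    return controls, passthrough
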
@@ -570,6 +704,10 @@ def test_langfuse_existing_trace_id():
    assert initial_langfuse_trace_dict == new_langfuse_trace_dict


@pytest.mark.skipif(
    condition=not os.environ.get("OPENAI_API_KEY", False),
    reason="Authentication missing for openai",
)
def test_langfuse_logging_tool_calling():
    litellm.set_verbose = True


@@ -1,7 +1,7 @@
# What is this?
## Tests slack alerting on proxy logging object

import sys
import sys, json
import os
import io, asyncio
from datetime import datetime, timedelta

@@ -10,14 +10,18 @@ from datetime import datetime, timedelta
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
from litellm.proxy.utils import ProxyLogging
from litellm.caching import DualCache
from litellm.caching import DualCache, RedisCache
import litellm
import pytest
import asyncio
from unittest.mock import patch, MagicMock
from litellm.utils import get_api_base
from litellm.caching import DualCache
from litellm.integrations.slack_alerting import SlackAlerting
from litellm.integrations.slack_alerting import SlackAlerting, DeploymentMetrics
import unittest.mock
from unittest.mock import AsyncMock
import pytest
from litellm.router import AlertingConfig, Router


@pytest.mark.parametrize(

@@ -61,7 +65,7 @@ async def test_get_api_base():
    end_time = datetime.now()

    time_difference_float, model, api_base, messages = (
        _pl.slack_alerting_instance._response_taking_too_long_callback(
        _pl.slack_alerting_instance._response_taking_too_long_callback_helper(
            kwargs={
                "model": model,
                "messages": messages,

@@ -98,7 +102,10 @@ def mock_env(monkeypatch):
# Test the __init__ method
def test_init():
    slack_alerting = SlackAlerting(
        alerting_threshold=32, alerting=["slack"], alert_types=["llm_exceptions"]
        alerting_threshold=32,
        alerting=["slack"],
        alert_types=["llm_exceptions"],
        internal_usage_cache=DualCache(),
    )
    assert slack_alerting.alerting_threshold == 32
    assert slack_alerting.alerting == ["slack"]

@@ -116,7 +123,7 @@ from datetime import datetime, timedelta

@pytest.fixture
def slack_alerting():
    return SlackAlerting(alerting_threshold=1)
    return SlackAlerting(alerting_threshold=1, internal_usage_cache=DualCache())


# Test for hanging LLM responses

@@ -185,3 +192,170 @@ async def test_send_alert(slack_alerting):
        mock_post.return_value.status_code = 200
        await slack_alerting.send_alert("Test message", "Low", "budget_alerts")
        mock_post.assert_awaited_once()


@pytest.mark.asyncio
async def test_daily_reports_unit_test(slack_alerting):
    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "test-gpt",
                    "litellm_params": {"model": "gpt-3.5-turbo"},
                    "model_info": {"id": "1234"},
                }
            ]
        )
        deployment_metrics = DeploymentMetrics(
            id="1234",
            failed_request=False,
            latency_per_output_token=20.3,
            updated_at=litellm.utils.get_utc_datetime(),
        )

        updated_val = await slack_alerting.async_update_daily_reports(
            deployment_metrics=deployment_metrics
        )

        assert updated_val == 1

        await slack_alerting.send_daily_reports(router=router)

        mock_send_alert.assert_awaited_once()


@pytest.mark.asyncio
async def test_daily_reports_completion(slack_alerting):
    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
        litellm.callbacks = [slack_alerting]

        # on async success
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt-5",
                    "litellm_params": {
                        "model": "gpt-3.5-turbo",
                    },
                }
            ]
        )

        await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )

        await asyncio.sleep(3)
        response_val = await slack_alerting.send_daily_reports(router=router)

        assert response_val == True

        mock_send_alert.assert_awaited_once()

        # on async failure
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt-5",
                    "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad_key"},
                }
            ]
        )

        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
            )
        except Exception as e:
            pass

        await asyncio.sleep(3)
        response_val = await slack_alerting.send_daily_reports(router=router)

        assert response_val == True

        mock_send_alert.assert_awaited()


@pytest.mark.asyncio
async def test_daily_reports_redis_cache_scheduler():
    redis_cache = RedisCache()
    slack_alerting = SlackAlerting(
        internal_usage_cache=DualCache(redis_cache=redis_cache)
    )
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            }
        ]
    )

    with patch.object(
        slack_alerting, "send_alert", new=AsyncMock()
    ) as mock_send_alert, patch.object(
        redis_cache, "async_set_cache", new=AsyncMock()
    ) as mock_redis_set_cache:
        # initial call - expect empty
        await slack_alerting._run_scheduler_helper(llm_router=router)

        try:
            json.dumps(mock_redis_set_cache.call_args[0][1])
        except Exception as e:
            pytest.fail(
                "Cache value can't be json dumped - {}".format(
                    mock_redis_set_cache.call_args[0][1]
                )
            )

        mock_redis_set_cache.assert_awaited_once()

        # second call - expect empty
        await slack_alerting._run_scheduler_helper(llm_router=router)


@pytest.mark.asyncio
@pytest.mark.skip(reason="Local test. Test if slack alerts are sent.")
async def test_send_llm_exception_to_slack():
    from litellm.router import AlertingConfig

    # on async success
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_key": "bad_key",
                },
            },
            {
                "model_name": "gpt-5-good",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
        ],
        alerting_config=AlertingConfig(
            alerting_threshold=0.5, webhook_url=os.getenv("SLACK_WEBHOOK_URL")
        ),
    )
    try:
        await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )
    except:
        pass

    await router.acompletion(
        model="gpt-5-good",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )

    await asyncio.sleep(3)

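These alerting tests repeat one pattern: swap an async method for unittest.mock.AsyncMock with patch.object, exercise the code, then assert the mock was awaited. A stripped-down sketch of that pattern on a hypothetical Alerter class (illustrative names, not LiteLLM APIs):

import asyncio
from unittest.mock import AsyncMock, patch


class Alerter:
    async def send_alert(self, message: str) -> None:
        ...  # a real implementation would post to a webhook


async def main():
    alerter = Alerter()
    with patch.object(alerter, "send_alert", new=AsyncMock()) as mock_send:
        # The patched attribute is the AsyncMock, so awaiting it records the call.
        await alerter.send_alert("hello")
        mock_send.assert_awaited_once_with("hello")


asyncio.run(main())
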
@@ -118,6 +118,7 @@ def test_completion_claude():

def test_completion_claude_3_empty_response():
    litellm.set_verbose = True

    messages = [
        {
            "role": "system",

@@ -2167,9 +2168,9 @@ def test_completion_replicate_vicuna():

def test_replicate_custom_prompt_dict():
    litellm.set_verbose = True
    model_name = "replicate/meta/llama-2-70b-chat"
    model_name = "replicate/meta/llama-2-7b"
    litellm.register_prompt_template(
        model="replicate/meta/llama-2-70b-chat",
        model="replicate/meta/llama-2-7b",
        initial_prompt_value="You are a good assistant",  # [OPTIONAL]
        roles={
            "system": {

@@ -2199,6 +2200,7 @@ def test_replicate_custom_prompt_dict():
            repetition_penalty=0.1,
            num_retries=3,
        )

    except litellm.APIError as e:
        pass
    except litellm.APIConnectionError as e:

@@ -3016,6 +3018,21 @@ async def test_acompletion_gemini():
        pytest.fail(f"Error occurred: {e}")


# Deepseek tests
def test_completion_deepseek():
    litellm.set_verbose = True
    model_name = "deepseek/deepseek-chat"
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    try:
        response = completion(model=model_name, messages=messages)
        # Add any assertions here to check the response
        print(response)
    except litellm.APIError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


# Palm tests
def test_completion_palm():
    litellm.set_verbose = True

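The replicate test above uses litellm.register_prompt_template to control how chat messages are flattened into a single prompt for the registered model. A hedged sketch of a fuller call; the pre_message/post_message strings are illustrative values, not the ones from the test:

import litellm
from litellm import completion

litellm.register_prompt_template(
    model="replicate/meta/llama-2-7b",
    initial_prompt_value="You are a good assistant",
    roles={
        # How each role's content is wrapped when building the flat prompt.
        "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
        "assistant": {"pre_message": "\n", "post_message": "\n"},
    },
    final_prompt_value="Now answer as best you can:",
)

response = completion(
    model="replicate/meta/llama-2-7b",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
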
@@ -231,14 +231,17 @@ def test_cost_bedrock_pricing():
    assert cost == predicted_cost


@pytest.mark.skip(reason="AWS disabled our access")
def test_cost_bedrock_pricing_actual_calls():
    litellm.set_verbose = True
    model = "anthropic.claude-instant-v1"
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    response = litellm.completion(model=model, messages=messages)
    assert response._hidden_params["region_name"] is not None
    response = litellm.completion(
        model=model, messages=messages, mock_response="hello cool one"
    )

    print("response", response)
    cost = litellm.completion_cost(
        model="bedrock/anthropic.claude-instant-v1",
        completion_response=response,
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )

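For reference, a short sketch of the pattern the updated bedrock test uses: mock_response keeps the call offline, and litellm.completion_cost prices the mocked response against the named model (the exact cost depends on litellm's bundled price map):

import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    mock_response="hello cool one",  # no AWS call is made
)

cost = litellm.completion_cost(
    completion_response=response,
    model="bedrock/anthropic.claude-instant-v1",
)
print("cost:", cost)
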
@@ -140,6 +140,8 @@ async def test_add_existing_deployment():
            deployment_2.to_json(exclude_none=True),
        ]
    )

    init_len_list = len(llm_router.model_list)
    print(f"llm_router: {llm_router}")
    master_key = "sk-1234"
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)

@@ -164,7 +166,7 @@ async def test_add_existing_deployment():
    db_models = [db_model]
    num_added = pc._add_deployment(db_models=db_models)

    assert num_added == 0
    assert init_len_list == len(llm_router.model_list)


    litellm_params = LiteLLM_Params(

Some files were not shown because too many files have changed in this diff.