Merge branch 'BerriAI:main' into ollama-image-handling

frob 2024-04-13 21:42:58 +02:00 committed by GitHub
commit ea117fc859
94 changed files with 10050 additions and 828 deletions

View file

@ -129,6 +129,7 @@ jobs:
build_and_test:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
@ -188,6 +189,9 @@ jobs:
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
-e AZURE_API_KEY=$AZURE_API_KEY \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \

View file

@ -1,5 +1,5 @@
/docs
/cookbook
/.circleci
/.github
/tests
docs
cookbook
.circleci
.github
tests

4 .gitignore vendored
View file

@ -46,3 +46,7 @@ deploy/charts/*.tgz
litellm/proxy/vertex_key.json
**/.vim/
/node_modules
kub.yaml
loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml

View file

@ -70,5 +70,4 @@ EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
CMD ["--port", "4000"]

View file

@ -205,7 +205,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [google - vertex_ai [Gemini]](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ |
| [google - palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ |
| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | | ✅ | | |
| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | | ✅ | | |
| [mistral ai api](https://docs.litellm.ai/docs/providers/mistral) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [cloudflare AI Workers](https://docs.litellm.ai/docs/providers/cloudflare_workers) | ✅ | ✅ | ✅ | ✅ |
| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | ✅ |
@ -220,7 +220,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ |
| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ |
| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ |
| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ |
| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |

204 cookbook/Proxy_Batch_Users.ipynb vendored Normal file
View file

@ -0,0 +1,204 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "680oRk1af-xJ"
},
"source": [
"# Environment Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X7TgJFn8f88p"
},
"outputs": [],
"source": [
"import csv\n",
"from typing import Optional\n",
"import httpx, json\n",
"import asyncio\n",
"\n",
"proxy_base_url = \"http://0.0.0.0:4000\" # 👈 SET TO PROXY URL\n",
"master_key = \"sk-1234\" # 👈 SET TO PROXY MASTER KEY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rauw8EOhgBz5"
},
"outputs": [],
"source": [
"## GLOBAL HTTP CLIENT ## - faster http calls\n",
"class HTTPHandler:\n",
" def __init__(self, concurrent_limit=1000):\n",
" # Create a client with a connection pool\n",
" self.client = httpx.AsyncClient(\n",
" limits=httpx.Limits(\n",
" max_connections=concurrent_limit,\n",
" max_keepalive_connections=concurrent_limit,\n",
" )\n",
" )\n",
"\n",
" async def close(self):\n",
" # Close the client when you're done with it\n",
" await self.client.aclose()\n",
"\n",
" async def get(\n",
" self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None\n",
" ):\n",
" response = await self.client.get(url, params=params, headers=headers)\n",
" return response\n",
"\n",
" async def post(\n",
" self,\n",
" url: str,\n",
" data: Optional[dict] = None,\n",
" params: Optional[dict] = None,\n",
" headers: Optional[dict] = None,\n",
" ):\n",
" try:\n",
" response = await self.client.post(\n",
" url, data=data, params=params, headers=headers\n",
" )\n",
" return response\n",
" except Exception as e:\n",
" raise e\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7LXN8zaLgOie"
},
"source": [
"# Import Sheet\n",
"\n",
"\n",
"Format: | ID | Name | Max Budget |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oiED0usegPGf"
},
"outputs": [],
"source": [
"async def import_sheet():\n",
" tasks = []\n",
" http_client = HTTPHandler()\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for row in csv_reader:\n",
" task = create_user(client=http_client, user_id=row['ID'], max_budget=row['Max Budget'], user_name=row['Name'])\n",
" tasks.append(task)\n",
" # print(f\"ID: {row['ID']}, Name: {row['Name']}, Max Budget: {row['Max Budget']}\")\n",
"\n",
" keys = await asyncio.gather(*tasks)\n",
"\n",
" with open('my-batch-sheet_new.csv', 'w', newline='') as new_file:\n",
" fieldnames = ['ID', 'Name', 'Max Budget', 'keys']\n",
" csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)\n",
" csv_writer.writeheader()\n",
"\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for i, row in enumerate(csv_reader):\n",
" row['keys'] = keys[i] # Add the 'keys' value from the corresponding task result\n",
" csv_writer.writerow(row)\n",
"\n",
" await http_client.close()\n",
"\n",
"asyncio.run(import_sheet())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E7M0Li_UgJeZ"
},
"source": [
"# Create Users + Keys\n",
"\n",
"- Creates a user\n",
"- Creates a key with max budget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NZudRFujf7j-"
},
"outputs": [],
"source": [
"\n",
"async def create_key_with_alias(client: HTTPHandler, user_id: str, max_budget: float):\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"key/generate\"\n",
"\n",
" # call /key/generate\n",
" print(\"CALLING /KEY/GENERATE\")\n",
" response = await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"key_alias\": f\"{user_id}-key\",\n",
" \"max_budget\": max_budget # 👈 KEY CHANGE: SETS MAX BUDGET PER KEY\n",
" })\n",
" )\n",
" print(f\"response: {response.text}\")\n",
" return response.json()[\"key\"]\n",
"\n",
"async def create_user(client: HTTPHandler, user_id: str, max_budget: float, user_name: str):\n",
" \"\"\"\n",
" - call /user/new\n",
" - create key for user\n",
" \"\"\"\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"user/new\"\n",
"\n",
" # call /user/new\n",
" await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"user_alias\": user_name,\n",
" \"auto_create_key\": False,\n",
" # \"max_budget\": max_budget # 👈 [OPTIONAL] Sets max budget per user (if you want to set a max budget across keys)\n",
" })\n",
" )\n",
"\n",
" # create key for user\n",
" return await create_key_with_alias(client=client, user_id=user_id, max_budget=max_budget)\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

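For reference, a minimal sketch of producing the `my-batch-sheet.csv` that `import_sheet()` above reads, in the `| ID | Name | Max Budget |` format (values are placeholders):

```python
# Illustrative only: writes my-batch-sheet.csv with the columns import_sheet() expects.
import csv

rows = [
    {"ID": "u-001", "Name": "Alice", "Max Budget": "10"},
    {"ID": "u-002", "Name": "Bob", "Max Budget": "25"},
]

with open("my-batch-sheet.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["ID", "Name", "Max Budget"])
    writer.writeheader()
    writer.writerows(rows)
```
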
73 cookbook/misc/config.yaml Normal file
View file

@ -0,0 +1,73 @@
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: gpt-3.5-turbo-large
litellm_params:
model: "gpt-3.5-turbo-1106"
api_key: os.environ/OPENAI_API_KEY
rpm: 480
timeout: 300
stream_timeout: 60
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
rpm: 480
timeout: 300
stream_timeout: 60
- model_name: sagemaker-completion-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_second: 0.000420
- model_name: text-embedding-ada-002
litellm_params:
model: azure/azure-embedding-model
api_key: os.environ/AZURE_API_KEY
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
model_info:
mode: embedding
base_model: text-embedding-ada-002
- model_name: dall-e-2
litellm_params:
model: azure/
api_version: 2023-06-01-preview
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_key: os.environ/AZURE_API_KEY
- model_name: openai-dall-e-3
litellm_params:
model: dall-e-3
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
drop_params: True
# max_budget: 100
# budget_duration: 30d
num_retries: 5
request_timeout: 600
telemetry: False
context_window_fallbacks: [{"gpt-3.5-turbo": ["gpt-3.5-turbo-large"]}]
general_settings:
master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys
store_model_in_db: True
proxy_budget_rescheduler_min_time: 60
proxy_budget_rescheduler_max_time: 64
proxy_batch_write_at: 1
# database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
# environment_variables:
# settings for using redis caching
# REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
# REDIS_PORT: "16337"
# REDIS_PASSWORD:

View file

@ -0,0 +1,92 @@
"""
LiteLLM Migration Script!
Takes a config.yaml and calls /model/new
Inputs:
- File path to config.yaml
- Proxy base url to your hosted proxy
Step 1: Reads your config.yaml
Step 2: reads `model_list` and loops through all models
Step 3: calls `<proxy-base-url>/model/new` for each model
"""
import yaml
import requests
_in_memory_os_variables = {}
def migrate_models(config_file, proxy_base_url):
# Step 1: Read the config.yaml file
with open(config_file, "r") as f:
config = yaml.safe_load(f)
# Step 2: Read the model_list and loop through all models
model_list = config.get("model_list", [])
print("model_list: ", model_list)
for model in model_list:
model_name = model.get("model_name")
print("\nAdding model: ", model_name)
litellm_params = model.get("litellm_params", {})
api_base = litellm_params.get("api_base", "")
print("api_base on config.yaml: ", api_base)
litellm_model_name = litellm_params.get("model", "") or ""
if "vertex_ai/" in litellm_model_name:
print(f"\033[91m\nSkipping Vertex AI model\033[0m", model)
continue
for param, value in litellm_params.items():
if isinstance(value, str) and value.startswith("os.environ/"):
# check if value is in _in_memory_os_variables
if value in _in_memory_os_variables:
new_value = _in_memory_os_variables[value]
print(
"\033[92mAlready entered value for \033[0m",
value,
"\033[92musing \033[0m",
new_value,
)
else:
new_value = input(f"Enter value for {value}: ")
_in_memory_os_variables[value] = new_value
litellm_params[param] = new_value
print("\nlitellm_params: ", litellm_params)
# Confirm before sending POST request
confirm = input(
"\033[92mDo you want to send the POST request with the above parameters? (y/n): \033[0m"
)
if confirm.lower() != "y":
print("Aborting POST request.")
exit()
# Step 3: Call <proxy-base-url>/model/new for each model
url = f"{proxy_base_url}/model/new"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {master_key}",
}
data = {"model_name": model_name, "litellm_params": litellm_params}
print("POSTING data to proxy url", url)
response = requests.post(url, headers=headers, json=data)
if response.status_code != 200:
print(f"Error: {response.status_code} - {response.text}")
raise Exception(f"Error: {response.status_code} - {response.text}")
# Print the response for each model
print(
f"Response for model '{model_name}': Status Code:{response.status_code} - {response.text}"
)
# Usage
config_file = "config.yaml"
proxy_base_url = "http://0.0.0.0:4000"
master_key = "sk-1234"
print(f"config_file: {config_file}")
print(f"proxy_base_url: {proxy_base_url}")
migrate_models(config_file, proxy_base_url)

View file

@ -1,10 +1,16 @@
version: "3.9"
services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-latest
volumes:
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
ports:
- "4000:4000"
environment:
- AZURE_API_KEY=sk-123
- "4000:4000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches the container port defined above in `ports`
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any

View file

@ -1,5 +1,5 @@
# Enterprise
For companies that need better security, user management and professional support
For companies that need SSO, user management and professional support for LiteLLM Proxy
:::info

View file

@ -95,8 +95,8 @@ print(content)
```
## Chat Models
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------|-------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro | `completion('gemini/gemini-1.5-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| Model Name | Function Call | Required OS Variables |
|-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion('gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -25,8 +25,11 @@ All models listed here https://docs.voyageai.com/embeddings/#models-and-specific
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| voyage-2 | `embedding(model="voyage/voyage-2", input)` |
| voyage-large-2 | `embedding(model="voyage/voyage-large-2", input)` |
| voyage-law-2 | `embedding(model="voyage/voyage-law-2", input)` |
| voyage-code-2 | `embedding(model="voyage/voyage-code-2", input)` |
| voyage-lite-02-instruct | `embedding(model="voyage/voyage-lite-02-instruct", input)` |
| voyage-01 | `embedding(model="voyage/voyage-01", input)` |
| voyage-lite-01 | `embedding(model="voyage/voyage-lite-01", input)` |
| voyage-lite-01-instruct | `embedding(model="voyage/voyage-lite-01-instruct", input)` |
| voyage-lite-01-instruct | `embedding(model="voyage/voyage-lite-01-instruct", input)` |

View file

@ -0,0 +1,9 @@
# 🎉 Demo App
Here is a demo of the proxy. To log in, pass in:
- Username: admin
- Password: sk-1234
[Demo UI](https://demo.litellm.ai/ui)

View file

@ -666,8 +666,8 @@ services:
litellm:
build:
context: .
args:
target: runtime
args:
target: runtime
image: ghcr.io/berriai/litellm:main-latest
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# ✨ Enterprise Features - Content Mod
# ✨ Enterprise Features - Content Mod, SSO
Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)
@ -12,16 +12,18 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
:::
Features:
- ✅ [SSO for Admin UI](./ui.md#✨-enterprise-features)
- ✅ Content Moderation with LLM Guard
- ✅ Content Moderation with LlamaGuard
- ✅ Content Moderation with Google Text Moderations
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- ✅ Don't log/store specific requests (eg confidential LLM requests)
- ✅ Don't log/store specific requests to Langfuse, Sentry, etc. (eg confidential LLM requests)
- ✅ Tracking Spend for Custom Tags
## Content Moderation
### Content Moderation with LLM Guard
@ -74,7 +76,7 @@ curl --location 'http://localhost:4000/key/generate' \
# Returns {..'key': 'my-new-key'}
```
**2. Test it!**
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
@ -87,6 +89,76 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
}'
```
#### Turn on/off per request
**1. Update config**
```yaml
litellm_settings:
callbacks: ["llmguard_moderations"]
llm_guard_mode: "request-specific"
```
**2. Create new key**
```bash
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"models": ["fake-openai-endpoint"],
}'
# Returns {..'key': 'my-new-key'}
```
**3. Test it!**
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
"metadata": {
"permissions": {
"enable_llm_guard_check": True # 👈 KEY CHANGE
},
}
}
)
print(response)
```
</TabItem>
<TabItem value="curl" label="Curl Request">
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer my-new-key' \ # 👈 TEST KEY
--data '{"model": "fake-openai-endpoint", "messages": [
{"role": "system", "content": "Be helpful"},
{"role": "user", "content": "What do you know?"}
]
}'
```
</TabItem>
</Tabs>
### Content Moderation with LlamaGuard

View file

@ -99,7 +99,7 @@ Now, when you [generate keys](./virtual_keys.md) for this team-id
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-D '{"team_id": "ishaans-secret-project"}'
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.

View file

@ -108,6 +108,34 @@ general_settings:
litellm_jwtauth:
admin_jwt_scope: "litellm-proxy-admin"
```
## Advanced - Spend Tracking (User / Team / Org)
Set the fields in the JWT token that correspond to a litellm user / team / org.
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
litellm_jwtauth:
admin_jwt_scope: "litellm-proxy-admin"
team_id_jwt_field: "client_id" # 👈 CAN BE ANY FIELD
user_id_jwt_field: "sub" # 👈 CAN BE ANY FIELD
org_id_jwt_field: "org_id" # 👈 CAN BE ANY FIELD
```
Expected JWT:
```
{
"client_id": "my-unique-team",
"sub": "my-unique-user",
"org_id": "my-unique-org"
}
```
Now litellm will automatically update the spend for the user/team/org in the db for each call.
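For illustration, a minimal sketch of what a token carrying the three mapped fields looks like (uses PyJWT and a placeholder HS256 secret; in practice the token is issued and signed by your identity provider):

```python
# Hypothetical token for illustration - the claim names match the config above:
#   client_id -> team_id_jwt_field, sub -> user_id_jwt_field, org_id -> org_id_jwt_field
import time
import jwt  # pip install pyjwt

token = jwt.encode(
    {
        "client_id": "my-unique-team",
        "sub": "my-unique-user",
        "org_id": "my-unique-org",
        "exp": int(time.time()) + 3600,
    },
    "placeholder-secret",  # placeholder signing key, not a litellm setting
    algorithm="HS256",
)
print(token)  # sent as: Authorization: Bearer <token>
```
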
### JWT Scopes
Here's what scopes on JWT-Auth tokens look like

View file

@ -56,6 +56,9 @@ On accessing the LiteLLM UI, you will be prompted to enter your username, passwo
## ✨ Enterprise Features
Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)
### Setup SSO/Auth for UI
#### Step 1: Set upperbounds for keys

View file

@ -95,12 +95,129 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
### Advanced
### Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
Router provides 4 strategies for routing your calls across multiple deployments:
<Tabs>
<TabItem value="usage-based-v2" label="Rate-Limit Aware v2 (ASYNC)">
**🎉 NEW** This is an async implementation of usage-based-routing.
**Filters out a deployment if its tpm/rpm limit is exceeded** - requires passing in the deployment's tpm/rpm limits.
Routes to **deployment with lowest TPM usage** for that minute.
In production, we use Redis to track usage (TPM/RPM) across multiple deployments. This implementation uses **async redis calls** (redis.incr and redis.mget).
For Azure, your RPM = TPM/6.
<Tabs>
<TabItem value="sdk" label="sdk">
```python
from litellm import Router
model_list = [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # model alias
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2", # actual model name
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 100000,
"rpm": 10000,
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 100000,
"rpm": 1000,
}, {
"model_name": "gpt-3.5-turbo",
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 100000,
"rpm": 1000,
}]
router = Router(model_list=model_list,
redis_host=os.environ["REDIS_HOST"],
redis_password=os.environ["REDIS_PASSWORD"],
redis_port=os.environ["REDIS_PORT"],
routing_strategy="usage-based-routing-v2" # 👈 KEY CHANGE
enable_pre_call_check=True, # enables router rate limits for concurrent calls
)
response = await router.acompletion(model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
print(response)
```
</TabItem>
<TabItem value="proxy" label="proxy">
**1. Set strategy in config**
```yaml
model_list:
- model_name: gpt-3.5-turbo # model alias
litellm_params: # params for litellm completion/embedding call
model: azure/chatgpt-v-2 # actual model name
api_key: os.environ/AZURE_API_KEY
api_version: os.environ/AZURE_API_VERSION
api_base: os.environ/AZURE_API_BASE
tpm: 100000
rpm: 10000
- model_name: gpt-3.5-turbo
litellm_params: # params for litellm completion/embedding call
model: gpt-3.5-turbo
api_key: os.getenv(OPENAI_API_KEY)
tpm: 100000
rpm: 1000
router_settings:
routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE
redis_host: <your-redis-host>
redis_password: <your-redis-password>
redis_port: <your-redis-port>
enable_pre_call_check: true
general_settings:
master_key: sk-1234
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**3. Test it!**
```bash
curl --location 'http://localhost:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}]
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="latency-based" label="Latency-Based">
@ -117,7 +234,10 @@ import asyncio
model_list = [{ ... }]
# init router
router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy
router = Router(model_list=model_list,
routing_strategy="latency-based-routing",# 👈 set routing strategy
enable_pre_call_check=True, # enables router rate limits for concurrent calls
)
## CALL 1+2
tasks = []
@ -257,8 +377,9 @@ router = Router(model_list=model_list,
redis_host=os.environ["REDIS_HOST"],
redis_password=os.environ["REDIS_PASSWORD"],
redis_port=os.environ["REDIS_PORT"],
routing_strategy="usage-based-routing")
routing_strategy="usage-based-routing"
enable_pre_call_check=True, # enables router rate limits for concurrent calls
)
response = await router.acompletion(model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
@ -555,7 +676,11 @@ router = Router(model_list: Optional[list] = None,
## Pre-Call Checks (Context Window)
Enable pre-call checks to filter out deployments with context window limit < messages for a call.
Enable pre-call checks to filter out:
1. deployments whose context window is smaller than a call's messages.
2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[
router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
])`)
<Tabs>
<TabItem value="sdk" label="SDK">

View file

@ -36,6 +36,7 @@ const sidebars = {
label: "📖 All Endpoints (Swagger)",
href: "https://litellm-api.up.railway.app/",
},
"proxy/demo",
"proxy/configs",
"proxy/reliability",
"proxy/users",
@ -163,7 +164,6 @@ const sidebars = {
"debugging/local_debugging",
"observability/callbacks",
"observability/custom_callback",
"observability/lunary_integration",
"observability/langfuse_integration",
"observability/sentry",
"observability/promptlayer_integration",
@ -171,6 +171,7 @@ const sidebars = {
"observability/langsmith_integration",
"observability/slack_integration",
"observability/traceloop_integration",
"observability/lunary_integration",
"observability/athina_integration",
"observability/helicone_integration",
"observability/supabase_integration",

View file

@ -95,7 +95,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
traceback.print_exc()
raise e
def should_proceed(self, user_api_key_dict: UserAPIKeyAuth) -> bool:
def should_proceed(self, user_api_key_dict: UserAPIKeyAuth, data: dict) -> bool:
if self.llm_guard_mode == "key-specific":
# check if llm guard enabled for specific keys only
self.print_verbose(
@ -108,6 +108,15 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
return True
elif self.llm_guard_mode == "all":
return True
elif self.llm_guard_mode == "request-specific":
self.print_verbose(f"received metadata: {data.get('metadata', {})}")
metadata = data.get("metadata", {})
permissions = metadata.get("permissions", {})
if (
"enable_llm_guard_check" in permissions
and permissions["enable_llm_guard_check"] == True
):
return True
return False
async def async_moderation_hook(
@ -126,7 +135,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
f"Inside LLM Guard Pre-Call Hook - llm_guard_mode={self.llm_guard_mode}"
)
_proceed = self.should_proceed(user_api_key_dict=user_api_key_dict)
_proceed = self.should_proceed(user_api_key_dict=user_api_key_dict, data=data)
if _proceed == False:
return

View file

@ -3,7 +3,11 @@ import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from litellm.caching import Cache
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm.proxy._types import KeyManagementSystem, KeyManagementSettings
from litellm.proxy._types import (
KeyManagementSystem,
KeyManagementSettings,
LiteLLM_UpperboundKeyGenerateParams,
)
import httpx
import dotenv
@ -64,7 +68,7 @@ google_moderation_confidence_threshold: Optional[float] = None
llamaguard_unsafe_content_categories: Optional[str] = None
blocked_user_list: Optional[Union[str, List]] = None
banned_keywords_list: Optional[Union[str, List]] = None
llm_guard_mode: Literal["all", "key-specific"] = "all"
llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
##################
logging: bool = True
caching: bool = (
@ -172,7 +176,7 @@ dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None
generic_logger_headers: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
upperbound_key_generate_params: Optional[Dict] = None
upperbound_key_generate_params: Optional[LiteLLM_UpperboundKeyGenerateParams] = None
default_user_params: Optional[Dict] = None
default_team_settings: Optional[List] = None
max_user_budget: Optional[float] = None

View file

@ -81,9 +81,30 @@ class InMemoryCache(BaseCache):
return cached_response
return None
def batch_get_cache(self, keys: list, **kwargs):
return_val = []
for k in keys:
val = self.get_cache(key=k, **kwargs)
return_val.append(val)
return return_val
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
async def async_batch_get_cache(self, keys: list, **kwargs):
return_val = []
for k in keys:
val = self.get_cache(key=k, **kwargs)
return_val.append(val)
return return_val
async def async_increment(self, key, value: int, **kwargs) -> int:
# get the value
init_value = await self.async_get_cache(key=key) or 0
value = init_value + value
await self.async_set_cache(key, value, **kwargs)
return value
def flush_cache(self):
self.cache_dict.clear()
self.ttl_dict.clear()
@ -246,6 +267,21 @@ class RedisCache(BaseCache):
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
await self.flush_cache_buffer()
async def async_increment(self, key, value: int, **kwargs) -> int:
_redis_client = self.init_async_client()
try:
async with _redis_client as redis_client:
result = await redis_client.incr(name=key, amount=value)
return result
except Exception as e:
verbose_logger.error(
"LiteLLM Redis Caching: async async_increment() - Got exception from REDIS %s, Writing value=%s",
str(e),
value,
)
traceback.print_exc()
raise e
async def flush_cache_buffer(self):
print_verbose(
f"flushing to redis....reached size of buffer {len(self.redis_batch_writing_buffer)}"
@ -283,6 +319,32 @@ class RedisCache(BaseCache):
traceback.print_exc()
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
def batch_get_cache(self, key_list) -> dict:
"""
Use Redis for bulk read operations
"""
key_value_dict = {}
try:
_keys = []
for cache_key in key_list:
cache_key = self.check_and_fix_namespace(key=cache_key)
_keys.append(cache_key)
results = self.redis_client.mget(keys=_keys)
# Associate the results back with their keys.
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
key_value_dict = dict(zip(key_list, results))
decoded_results = {
k.decode("utf-8"): self._get_cache_logic(v)
for k, v in key_value_dict.items()
}
return decoded_results
except Exception as e:
print_verbose(f"Error occurred in pipeline read - {str(e)}")
return key_value_dict
async def async_get_cache(self, key, **kwargs):
_redis_client = self.init_async_client()
key = self.check_and_fix_namespace(key=key)
@ -301,7 +363,7 @@ class RedisCache(BaseCache):
f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
)
async def async_get_cache_pipeline(self, key_list) -> dict:
async def async_batch_get_cache(self, key_list) -> dict:
"""
Use Redis for bulk read operations
"""
@ -309,14 +371,11 @@ class RedisCache(BaseCache):
key_value_dict = {}
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
# Queue the get operations in the pipeline for all keys.
for cache_key in key_list:
cache_key = self.check_and_fix_namespace(key=cache_key)
pipe.get(cache_key) # Queue GET command in pipeline
# Execute the pipeline and await the results.
results = await pipe.execute()
_keys = []
for cache_key in key_list:
cache_key = self.check_and_fix_namespace(key=cache_key)
_keys.append(cache_key)
results = await redis_client.mget(keys=_keys)
# Associate the results back with their keys.
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
@ -897,6 +956,39 @@ class DualCache(BaseCache):
except Exception as e:
traceback.print_exc()
def batch_get_cache(self, keys: list, local_only: bool = False, **kwargs):
try:
result = [None for _ in range(len(keys))]
if self.in_memory_cache is not None:
in_memory_result = self.in_memory_cache.batch_get_cache(keys, **kwargs)
print_verbose(f"in_memory_result: {in_memory_result}")
if in_memory_result is not None:
result = in_memory_result
if None in result and self.redis_cache is not None and local_only == False:
"""
- for the none values in the result
- check the redis cache
"""
sublist_keys = [
key for key, value in zip(keys, result) if value is None
]
# If not found in in-memory cache, try fetching from Redis
redis_result = self.redis_cache.batch_get_cache(sublist_keys, **kwargs)
if redis_result is not None:
# Update in-memory cache with the value from Redis
for key in redis_result:
self.in_memory_cache.set_cache(key, redis_result[key], **kwargs)
for key, value in redis_result.items():
result[sublist_keys.index(key)] = value
print_verbose(f"async batch get cache: cache result: {result}")
return result
except Exception as e:
traceback.print_exc()
async def async_get_cache(self, key, local_only: bool = False, **kwargs):
# Try to fetch from in-memory cache first
try:
@ -930,6 +1022,50 @@ class DualCache(BaseCache):
except Exception as e:
traceback.print_exc()
async def async_batch_get_cache(
self, keys: list, local_only: bool = False, **kwargs
):
try:
result = [None for _ in range(len(keys))]
if self.in_memory_cache is not None:
in_memory_result = await self.in_memory_cache.async_batch_get_cache(
keys, **kwargs
)
print_verbose(f"in_memory_result: {in_memory_result}")
if in_memory_result is not None:
result = in_memory_result
if None in result and self.redis_cache is not None and local_only == False:
"""
- for the none values in the result
- check the redis cache
"""
sublist_keys = [
key for key, value in zip(keys, result) if value is None
]
# If not found in in-memory cache, try fetching from Redis
redis_result = await self.redis_cache.async_batch_get_cache(
sublist_keys, **kwargs
)
if redis_result is not None:
# Update in-memory cache with the value from Redis
for key in redis_result:
await self.in_memory_cache.async_set_cache(
key, redis_result[key], **kwargs
)
sublist_dict = dict(zip(sublist_keys, redis_result))
for key, value in sublist_dict.items():
result[sublist_keys.index(key)] = value
print_verbose(f"async batch get cache: cache result: {result}")
return result
except Exception as e:
traceback.print_exc()
async def async_set_cache(self, key, value, local_only: bool = False, **kwargs):
try:
if self.in_memory_cache is not None:
@ -941,6 +1077,32 @@ class DualCache(BaseCache):
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
async def async_increment_cache(
self, key, value: int, local_only: bool = False, **kwargs
) -> int:
"""
Key - the key in cache
Value - int - the value you want to increment by
Returns - int - the incremented value
"""
try:
result: int = value
if self.in_memory_cache is not None:
result = await self.in_memory_cache.async_increment(
key, value, **kwargs
)
if self.redis_cache is not None and local_only == False:
result = await self.redis_cache.async_increment(key, value, **kwargs)
return result
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
raise e
def flush_cache(self):
if self.in_memory_cache is not None:
self.in_memory_cache.flush_cache()

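A minimal sketch of the new batch-read and increment helpers (assumes `DualCache` and `InMemoryCache` are importable from `litellm.caching`; Redis is only consulted when a `redis_cache` is configured):

```python
import asyncio

from litellm.caching import DualCache, InMemoryCache

cache = DualCache(in_memory_cache=InMemoryCache())  # redis_cache=None -> in-memory only

async def main():
    await cache.async_set_cache("k1", "v1")
    # Batch read: returns a list of values aligned with the keys list (None if missing).
    print(await cache.async_batch_get_cache(keys=["k1", "k2"]))
    # Increment a counter; written through to Redis as well when configured.
    print(await cache.async_increment_cache(key="rpm:gpt-3.5-turbo", value=1))

asyncio.run(main())
```
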
View file

@ -161,7 +161,7 @@ class LangFuseLogger:
verbose_logger.info(f"Langfuse Layer Logging - logging success")
except:
traceback.print_exc()
print(f"Langfuse Layer Error - {traceback.format_exc()}")
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
pass
async def _async_log_event(
@ -190,7 +190,7 @@ class LangFuseLogger:
):
from langfuse.model import CreateTrace, CreateGeneration
print(
verbose_logger.warning(
"Please upgrade langfuse to v2.0.0 or higher: https://github.com/langfuse/langfuse-python/releases/tag/v2.0.1"
)
@ -247,7 +247,6 @@ class LangFuseLogger:
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
print(f"response_obj: {response_obj}")
if supports_tags:
metadata_tags = metadata.get("tags", [])
tags = metadata_tags
@ -312,13 +311,11 @@ class LangFuseLogger:
usage = None
if response_obj is not None and response_obj.get("id", None) is not None:
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
print(f"getting usage, cost={cost}")
usage = {
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
"total_cost": cost if supports_costs else None,
}
print(f"constructed usage - {usage}")
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
@ -351,4 +348,4 @@ class LangFuseLogger:
trace.generation(**generation_params)
except Exception as e:
print(f"Langfuse Layer Error - {traceback.format_exc()}")
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")

View file

@ -53,6 +53,8 @@ class LangsmithLogger:
value = kwargs[key]
if key == "start_time" or key == "end_time":
pass
elif type(value) == datetime.datetime:
new_kwargs[key] = value.isoformat()
elif type(value) != dict:
new_kwargs[key] = value

View file

@ -7,7 +7,8 @@ from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .base import BaseLLM
import httpx
@ -15,6 +16,8 @@ class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman: "
AI_PROMPT = "\n\nAssistant: "
# constants from https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/_constants.py
class AnthropicError(Exception):
def __init__(self, status_code, message):
@ -36,7 +39,9 @@ class AnthropicConfig:
to pass metadata to anthropic, it's {"user_id": "any-relevant-information"}
"""
max_tokens: Optional[int] = 4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
max_tokens: Optional[int] = (
4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
)
stop_sequences: Optional[list] = None
temperature: Optional[int] = None
top_p: Optional[int] = None
@ -46,7 +51,9 @@ class AnthropicConfig:
def __init__(
self,
max_tokens: Optional[int] = 4096, # You can pass in a value yourself or use the default value 4096
max_tokens: Optional[
int
] = 4096, # You can pass in a value yourself or use the default value 4096
stop_sequences: Optional[list] = None,
temperature: Optional[int] = None,
top_p: Optional[int] = None,
@ -95,121 +102,23 @@ def validate_environment(api_key, user_headers):
return headers
def completion(
model: str,
messages: list,
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
optional_params=None,
litellm_params=None,
logger_fn=None,
headers={},
):
headers = validate_environment(api_key, headers)
_is_function_call = False
messages = copy.deepcopy(messages)
optional_params = copy.deepcopy(optional_params)
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
# Separate system prompt from rest of message
system_prompt_indices = []
system_prompt = ""
for idx, message in enumerate(messages):
if message["role"] == "system":
system_prompt += message["content"]
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
# Format rest of message according to anthropic guidelines
try:
messages = prompt_factory(
model=model, messages=messages, custom_llm_provider="anthropic"
)
except Exception as e:
raise AnthropicError(status_code=400, message=str(e))
## Load Config
config = litellm.AnthropicConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## Handle Tool Calling
if "tools" in optional_params:
_is_function_call = True
headers["anthropic-beta"] = "tools-2024-04-04"
anthropic_tools = []
for tool in optional_params["tools"]:
new_tool = tool["function"]
new_tool["input_schema"] = new_tool.pop("parameters") # rename key
anthropic_tools.append(new_tool)
optional_params["tools"] = anthropic_tools
stream = optional_params.pop("stream", None)
data = {
"model": model,
"messages": messages,
**optional_params,
}
## LOGGING
logging_obj.pre_call(
input=messages,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
)
print_verbose(f"_is_function_call: {_is_function_call}")
## COMPLETION CALL
if (
stream and not _is_function_call
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes anthropic streaming POST request")
data["stream"] = stream
response = requests.post(
api_base,
headers=headers,
data=json.dumps(data),
stream=stream,
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
return response.iter_lines()
else:
response = requests.post(api_base, headers=headers, data=json.dumps(data))
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
class AnthropicChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
def process_response(
self,
model,
response,
model_response,
_is_function_call,
stream,
logging_obj,
api_key,
data,
messages,
print_verbose,
):
## LOGGING
logging_obj.post_call(
input=messages,
@ -327,6 +236,272 @@ def completion(
model_response.usage = usage
return model_response
async def acompletion_stream_function(
self,
model: str,
messages: list,
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
stream,
_is_function_call,
data=None,
optional_params=None,
litellm_params=None,
logger_fn=None,
headers={},
):
self.async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
response = await self.async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
completion_stream = response.aiter_lines()
streamwrapper = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="anthropic",
logging_obj=logging_obj,
)
return streamwrapper
async def acompletion_function(
self,
model: str,
messages: list,
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
stream,
_is_function_call,
data=None,
optional_params=None,
litellm_params=None,
logger_fn=None,
headers={},
):
self.async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
response = await self.async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
return self.process_response(
model=model,
response=response,
model_response=model_response,
_is_function_call=_is_function_call,
stream=stream,
logging_obj=logging_obj,
api_key=api_key,
data=data,
messages=messages,
print_verbose=print_verbose,
)
def completion(
self,
model: str,
messages: list,
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
optional_params=None,
acompletion=None,
litellm_params=None,
logger_fn=None,
headers={},
):
headers = validate_environment(api_key, headers)
_is_function_call = False
messages = copy.deepcopy(messages)
optional_params = copy.deepcopy(optional_params)
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
# Separate system prompt from rest of message
system_prompt_indices = []
system_prompt = ""
for idx, message in enumerate(messages):
if message["role"] == "system":
system_prompt += message["content"]
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
# Format rest of message according to anthropic guidelines
try:
messages = prompt_factory(
model=model, messages=messages, custom_llm_provider="anthropic"
)
except Exception as e:
raise AnthropicError(status_code=400, message=str(e))
## Load Config
config = litellm.AnthropicConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## Handle Tool Calling
if "tools" in optional_params:
_is_function_call = True
headers["anthropic-beta"] = "tools-2024-04-04"
anthropic_tools = []
for tool in optional_params["tools"]:
new_tool = tool["function"]
new_tool["input_schema"] = new_tool.pop("parameters") # rename key
anthropic_tools.append(new_tool)
optional_params["tools"] = anthropic_tools
stream = optional_params.pop("stream", None)
data = {
"model": model,
"messages": messages,
**optional_params,
}
## LOGGING
logging_obj.pre_call(
input=messages,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
)
print_verbose(f"_is_function_call: {_is_function_call}")
if acompletion == True:
if (
stream and not _is_function_call
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes async anthropic streaming POST request")
data["stream"] = stream
return self.acompletion_stream_function(
model=model,
messages=messages,
data=data,
api_base=api_base,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
api_key=api_key,
logging_obj=logging_obj,
optional_params=optional_params,
stream=stream,
_is_function_call=_is_function_call,
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
)
else:
return self.acompletion_function(
model=model,
messages=messages,
data=data,
api_base=api_base,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
api_key=api_key,
logging_obj=logging_obj,
optional_params=optional_params,
stream=stream,
_is_function_call=_is_function_call,
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
)
else:
## COMPLETION CALL
if (
stream and not _is_function_call
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes anthropic streaming POST request")
data["stream"] = stream
response = requests.post(
api_base,
headers=headers,
data=json.dumps(data),
stream=stream,
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
completion_stream = response.iter_lines()
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="anthropic",
logging_obj=logging_obj,
)
return streaming_response
else:
response = requests.post(
api_base, headers=headers, data=json.dumps(data)
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
return self.process_response(
model=model,
response=response,
model_response=model_response,
_is_function_call=_is_function_call,
stream=stream,
logging_obj=logging_obj,
api_key=api_key,
data=data,
messages=messages,
print_verbose=print_verbose,
)
def embedding(self):
# logic for parsing in - calling - parsing out model embedding calls
pass
class ModelResponseIterator:
def __init__(self, model_response):
@ -352,8 +527,3 @@ class ModelResponseIterator:
raise StopAsyncIteration
self.is_done = True
return self.model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass

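A minimal sketch of an async Anthropic call, assuming the rest of this commit wires `litellm.acompletion` into the new `AnthropicChatCompletion` class and `ANTHROPIC_API_KEY` is set (model name is a placeholder):

```python
import asyncio

import litellm

async def main():
    response = await litellm.acompletion(
        model="claude-3-haiku-20240307",  # placeholder Anthropic model name
        messages=[{"role": "user", "content": "Say hello in five words."}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```
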
View file

@ -4,7 +4,7 @@ from enum import Enum
import requests
import time
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx
@ -162,8 +162,15 @@ def completion(
raise AnthropicError(
status_code=response.status_code, message=response.text
)
completion_stream = response.iter_lines()
stream_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="anthropic",
logging_obj=logging_obj,
)
return stream_response
return response.iter_lines()
else:
response = requests.post(api_base, headers=headers, data=json.dumps(data))
if response.status_code != 200:

View file

@ -1,21 +1,34 @@
import httpx, asyncio
from typing import Optional
from typing import Optional, Union, Mapping, Any
# https://www.python-httpx.org/advanced/timeouts
_DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0)
class AsyncHTTPHandler:
def __init__(self, concurrent_limit=1000):
def __init__(
self, timeout: httpx.Timeout = _DEFAULT_TIMEOUT, concurrent_limit=1000
):
# Create a client with a connection pool
self.client = httpx.AsyncClient(
timeout=timeout,
limits=httpx.Limits(
max_connections=concurrent_limit,
max_keepalive_connections=concurrent_limit,
)
),
)
async def close(self):
# Close the client when you're done with it
await self.client.aclose()
async def __aenter__(self):
return self.client
async def __aexit__(self):
# close the client when exiting
await self.client.aclose()
async def get(
self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
):
@ -25,12 +38,15 @@ class AsyncHTTPHandler:
async def post(
self,
url: str,
data: Optional[dict] = None,
data: Optional[Union[dict, str]] = None, # type: ignore
params: Optional[dict] = None,
headers: Optional[dict] = None,
):
response = await self.client.post(
url, data=data, params=params, headers=headers
url,
data=data, # type: ignore
params=params,
headers=headers,
)
return response

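A minimal sketch of the updated handler with an explicit timeout (the URL is a placeholder; `data` may now be a dict or a pre-serialized string):

```python
import asyncio

import httpx
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler

async def main():
    client = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
    try:
        response = await client.post(
            "https://example.com/v1/endpoint",  # placeholder URL
            data='{"ping": "pong"}',            # str payloads are now accepted
            headers={"content-type": "application/json"},
        )
        print(response.status_code)
    finally:
        await client.close()

asyncio.run(main())
```
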
View file

@ -6,7 +6,8 @@ from typing import Callable, Optional
from litellm.utils import ModelResponse, get_secret, Choices, Message, Usage
import litellm
import sys, httpx
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import prompt_factory, custom_prompt, get_system_prompt
from packaging.version import Version
class GeminiError(Exception):
@ -103,6 +104,13 @@ class TextStreamer:
break
def supports_system_instruction():
import google.generativeai as genai
gemini_pkg_version = Version(genai.__version__)
return gemini_pkg_version >= Version("0.5.0")
def completion(
model: str,
messages: list,
@ -124,7 +132,7 @@ def completion(
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
)
genai.configure(api_key=api_key)
system_prompt = ""
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
@ -135,6 +143,7 @@ def completion(
messages=messages,
)
else:
system_prompt, messages = get_system_prompt(messages=messages)
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="gemini"
)
@ -162,11 +171,20 @@ def completion(
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": {"inference_params": inference_params}},
additional_args={
"complete_input_dict": {
"inference_params": inference_params,
"system_prompt": system_prompt,
}
},
)
## COMPLETION CALL
try:
_model = genai.GenerativeModel(f"models/{model}")
_params = {"model_name": "models/{}".format(model)}
_system_instruction = supports_system_instruction()
if _system_instruction and len(system_prompt) > 0:
_params["system_instruction"] = system_prompt
_model = genai.GenerativeModel(**_params)
if stream == True:
if acompletion == True:
@ -213,11 +231,12 @@ def completion(
encoding=encoding,
)
else:
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
safety_settings=safety_settings,
)
params = {
"contents": prompt,
"generation_config": genai.types.GenerationConfig(**inference_params),
"safety_settings": safety_settings,
}
response = _model.generate_content(**params)
except Exception as e:
raise GeminiError(
message=str(e),

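A minimal sketch of a call that exercises the new system-prompt handling (assumes `GEMINI_API_KEY` is set; with `google-generativeai` >= 0.5.0 the system message is forwarded as `system_instruction`):

```python
import litellm

response = litellm.completion(
    model="gemini/gemini-1.5-pro-latest",
    messages=[
        # Forwarded as system_instruction when google-generativeai >= 0.5.0 is installed.
        {"role": "system", "content": "Answer in exactly one sentence."},
        {"role": "user", "content": "What does LiteLLM do?"},
    ],
)
print(response.choices[0].message.content)
```
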
View file

@ -254,7 +254,7 @@ def get_ollama_response(
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
completion_tokens = response_json["eval_count"]
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -356,7 +356,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"]))) # type: ignore
completion_tokens = response_json["eval_count"]
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

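A minimal sketch of a call that still reports usage when Ollama's response omits `eval_count` (assumes a local Ollama server; model name is a placeholder):

```python
import litellm

response = litellm.completion(
    model="ollama/llama2",  # placeholder local model
    messages=[{"role": "user", "content": "One-line summary of LiteLLM?"}],
    api_base="http://localhost:11434",
)
# completion_tokens now falls back to the length of the returned message content
# when eval_count is missing from Ollama's response.
print(response.usage)
```
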
View file

@ -1,9 +1,9 @@
from enum import Enum
import requests, traceback
import json, re, xml.etree.ElementTree as ET
from jinja2 import Template, exceptions, Environment, meta
from jinja2 import Template, exceptions, meta, BaseLoader
from jinja2.sandbox import ImmutableSandboxedEnvironment
from typing import Optional, Any
import imghdr, base64
from typing import List
import litellm
@ -219,6 +219,15 @@ def phind_codellama_pt(messages):
def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] = None):
# Define Jinja2 environment
env = ImmutableSandboxedEnvironment()
def raise_exception(message):
raise Exception(f"Error message - {message}")
# Create a template object from the template text
env.globals["raise_exception"] = raise_exception
## get the tokenizer config from huggingface
bos_token = ""
eos_token = ""
@ -249,12 +258,6 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] =
eos_token = tokenizer_config["eos_token"]
chat_template = tokenizer_config["chat_template"]
def raise_exception(message):
raise Exception(f"Error message - {message}")
# Create a template object from the template text
env = Environment()
env.globals["raise_exception"] = raise_exception
try:
template = env.from_string(chat_template)
except Exception as e:
@ -959,7 +962,20 @@ def parse_xml_params(xml_content, json_schema: Optional[dict] = None):
return params
###
### GEMINI HELPER FUNCTIONS ###
def get_system_prompt(messages):
system_prompt_indices = []
system_prompt = ""
for idx, message in enumerate(messages):
if message["role"] == "system":
system_prompt += message["content"]
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
return system_prompt, messages
def convert_openai_message_to_cohere_tool_result(message):

View file

@ -332,9 +332,12 @@ def completion(
model_response["choices"][0]["message"]["content"] = result
# Calculate usage
prompt_tokens = len(encoding.encode(prompt))
prompt_tokens = len(encoding.encode(prompt, disallowed_special=()))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
encoding.encode(
model_response["choices"][0]["message"].get("content", ""),
disallowed_special=(),
)
)
model_response["model"] = "replicate/" + model
usage = Usage(
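The `disallowed_special=()` change above matters because tiktoken refuses to encode text that contains special-token strings unless they are explicitly allowed. A small hedged demo (assumes the `tiktoken` package and the `cl100k_base` encoding):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "user content containing <|endoftext|> verbatim"

# enc.encode(text)  # would raise ValueError because of the special-token string
tokens = enc.encode(text, disallowed_special=())  # treat it as ordinary text
print(len(tokens))
```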

View file

@ -3,10 +3,10 @@ import json
from enum import Enum
import requests
import time
from typing import Callable, Optional, Union
from typing import Callable, Optional, Union, List
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid
import httpx
import httpx, inspect
class VertexAIError(Exception):
@ -25,6 +25,7 @@ class VertexAIError(Exception):
class VertexAIConfig:
"""
Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
The class `VertexAIConfig` provides configuration for the VertexAI's API interface. Below are the parameters:
@ -36,6 +37,12 @@ class VertexAIConfig:
- `top_k` (integer): The value of `top_k` determines how many of the most probable tokens are considered in the selection. For example, a `top_k` of 1 means the selected token is the most probable among all tokens. The default value is 40.
- `response_mime_type` (str): The MIME type of the response. The default value is 'text/plain'.
- `candidate_count` (int): Number of generated responses to return.
- `stop_sequences` (List[str]): The set of character sequences (up to 5) that will stop output generation. If specified, the API will stop at the first appearance of a stop sequence. The stop sequence will not be included as part of the response.
Note: Please make sure to modify the default parameters as required for your use case.
"""
@ -43,6 +50,9 @@ class VertexAIConfig:
max_output_tokens: Optional[int] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
response_mime_type: Optional[str] = None
candidate_count: Optional[int] = None
stop_sequences: Optional[list] = None
def __init__(
self,
@ -50,6 +60,9 @@ class VertexAIConfig:
max_output_tokens: Optional[int] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
response_mime_type: Optional[str] = None,
candidate_count: Optional[int] = None,
stop_sequences: Optional[list] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
@ -295,6 +308,42 @@ def completion(
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
import google.auth # type: ignore
class ExtendedGenerationConfig(GenerationConfig):
"""Extended parameters for the generation."""
def __init__(
self,
*,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
candidate_count: Optional[int] = None,
max_output_tokens: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
response_mime_type: Optional[str] = None,
):
args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
if "response_mime_type" in args_spec.args:
self._raw_generation_config = gapic_content_types.GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
candidate_count=candidate_count,
max_output_tokens=max_output_tokens,
stop_sequences=stop_sequences,
response_mime_type=response_mime_type,
)
else:
self._raw_generation_config = gapic_content_types.GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
candidate_count=candidate_count,
max_output_tokens=max_output_tokens,
stop_sequences=stop_sequences,
)
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
print_verbose(
f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
@ -417,7 +466,7 @@ def completion(
return async_completion(**data)
if mode == "vision":
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
print_verbose("\nMaking VertexAI Gemini Pro / Pro Vision Call")
print_verbose(f"\nProcessing input messages = {messages}")
tools = optional_params.pop("tools", None)
prompt, images = _gemini_vision_convert_messages(messages=messages)
@ -436,7 +485,7 @@ def completion(
model_response = llm_model.generate_content(
contents=content,
generation_config=GenerationConfig(**optional_params),
generation_config=ExtendedGenerationConfig(**optional_params),
safety_settings=safety_settings,
stream=True,
tools=tools,
@ -458,7 +507,7 @@ def completion(
## LLM Call
response = llm_model.generate_content(
contents=content,
generation_config=GenerationConfig(**optional_params),
generation_config=ExtendedGenerationConfig(**optional_params),
safety_settings=safety_settings,
tools=tools,
)
@ -698,6 +747,43 @@ async def async_completion(
"""
try:
from vertexai.preview.generative_models import GenerationConfig
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
class ExtendedGenerationConfig(GenerationConfig):
"""Extended parameters for the generation."""
def __init__(
self,
*,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
candidate_count: Optional[int] = None,
max_output_tokens: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
response_mime_type: Optional[str] = None,
):
args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
if "response_mime_type" in args_spec.args:
self._raw_generation_config = gapic_content_types.GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
candidate_count=candidate_count,
max_output_tokens=max_output_tokens,
stop_sequences=stop_sequences,
response_mime_type=response_mime_type,
)
else:
self._raw_generation_config = gapic_content_types.GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
candidate_count=candidate_count,
max_output_tokens=max_output_tokens,
stop_sequences=stop_sequences,
)
if mode == "vision":
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
@ -721,7 +807,7 @@ async def async_completion(
## LLM Call
response = await llm_model._generate_content_async(
contents=content,
generation_config=GenerationConfig(**optional_params),
generation_config=ExtendedGenerationConfig(**optional_params),
tools=tools,
)
@ -906,6 +992,43 @@ async def async_streaming(
Add support for async streaming calls for gemini-pro
"""
from vertexai.preview.generative_models import GenerationConfig
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
class ExtendedGenerationConfig(GenerationConfig):
"""Extended parameters for the generation."""
def __init__(
self,
*,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
candidate_count: Optional[int] = None,
max_output_tokens: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
response_mime_type: Optional[str] = None,
):
args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
if "response_mime_type" in args_spec.args:
self._raw_generation_config = gapic_content_types.GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
candidate_count=candidate_count,
max_output_tokens=max_output_tokens,
stop_sequences=stop_sequences,
response_mime_type=response_mime_type,
)
else:
self._raw_generation_config = gapic_content_types.GenerationConfig(
temperature=temperature,
top_p=top_p,
top_k=top_k,
candidate_count=candidate_count,
max_output_tokens=max_output_tokens,
stop_sequences=stop_sequences,
)
if mode == "vision":
stream = optional_params.pop("stream")
@ -927,7 +1050,7 @@ async def async_streaming(
response = await llm_model._generate_content_streaming_async(
contents=content,
generation_config=GenerationConfig(**optional_params),
generation_config=ExtendedGenerationConfig(**optional_params),
tools=tools,
)
optional_params["stream"] = True

View file

@ -39,7 +39,6 @@ from litellm.utils import (
get_optional_params_image_gen,
)
from .llms import (
anthropic,
anthropic_text,
together_ai,
ai21,
@ -68,6 +67,7 @@ from .llms import (
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion
from .llms.azure_text import AzureTextCompletion
from .llms.anthropic import AnthropicChatCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.prompt_templates.factory import (
prompt_factory,
@ -99,6 +99,7 @@ from litellm.utils import (
dotenv.load_dotenv() # Loading env variables using dotenv
openai_chat_completions = OpenAIChatCompletion()
openai_text_completions = OpenAITextCompletion()
anthropic_chat_completions = AnthropicChatCompletion()
azure_chat_completions = AzureChatCompletion()
azure_text_completions = AzureTextCompletion()
huggingface = Huggingface()
@ -304,6 +305,7 @@ async def acompletion(
or custom_llm_provider == "vertex_ai"
or custom_llm_provider == "gemini"
or custom_llm_provider == "sagemaker"
or custom_llm_provider == "anthropic"
or custom_llm_provider in litellm.openai_compatible_providers
): # native async calls are currently implemented only for azure, openai, hf, ollama, and vertex ai; support for all providers is coming.
init_response = await loop.run_in_executor(None, func_with_context)
@ -315,6 +317,14 @@ async def acompletion(
response = await init_response
else:
response = init_response # type: ignore
if custom_llm_provider == "text-completion-openai" and isinstance(
response, TextCompletionResponse
):
response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
response_object=response,
model_response_object=litellm.ModelResponse(),
)
else:
# Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context) # type: ignore
@ -1180,10 +1190,11 @@ def completion(
or get_secret("ANTHROPIC_API_BASE")
or "https://api.anthropic.com/v1/messages"
)
response = anthropic.completion(
response = anthropic_chat_completions.completion(
model=model,
messages=messages,
api_base=api_base,
acompletion=acompletion,
custom_prompt_dict=litellm.custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
@ -1195,19 +1206,6 @@ def completion(
logging_obj=logging,
headers=headers,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and not isinstance(response, CustomStreamWrapper)
):
# don't try to access stream object,
response = CustomStreamWrapper(
response,
model,
custom_llm_provider="anthropic",
logging_obj=logging,
)
if optional_params.get("stream", False) or acompletion == True:
## LOGGING
logging.post_call(
@ -3786,6 +3784,9 @@ async def ahealth_check(
api_base = model_params.get("api_base") or get_secret("OPENAI_API_BASE")
if custom_llm_provider == "text-completion-openai":
mode = "completion"
response = await openai_chat_completions.ahealth_check(
model=model,
messages=model_params.get(
@ -3819,11 +3820,15 @@ async def ahealth_check(
return response
except Exception as e:
traceback.print_exc()
stack_trace = traceback.format_exc()
if isinstance(stack_trace, str):
stack_trace = stack_trace[:1000]
if model not in litellm.model_cost and mode is None:
raise Exception(
"Missing `mode`. Set the `mode` for the model - https://docs.litellm.ai/docs/proxy/health#embedding-models"
)
return {"error": f"{str(e)}"}
error_to_return = str(e) + " stack trace: " + stack_trace
return {"error": error_to_return}
####### HELPER FUNCTIONS ################
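With anthropic routed through `AnthropicChatCompletion` and added to the native-async branch of `acompletion`, Claude calls no longer need a thread-pool round trip. A hedged usage sketch; the model id is an example and `ANTHROPIC_API_KEY` is assumed to be set in the environment:

```python
import asyncio
import litellm

async def main():
    response = await litellm.acompletion(
        model="anthropic/claude-3-haiku-20240307",  # example model id
        messages=[{"role": "user", "content": "Say hi in five words."}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```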

View file

@ -66,6 +66,28 @@
"litellm_provider": "openai",
"mode": "chat"
},
"gpt-4-turbo": {
"max_tokens": 4096,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00001,
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00001,
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
"max_input_tokens": 128000,
@ -948,6 +970,28 @@
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.0-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.0-pro-002": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -970,6 +1014,17 @@
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0409": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-experimental": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -2808,6 +2863,46 @@
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-large-2": {
"max_tokens": 16000,
"max_input_tokens": 16000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-law-2": {
"max_tokens": 16000,
"max_input_tokens": 16000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-code-2": {
"max_tokens": 16000,
"max_input_tokens": 16000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-2": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-lite-02-instruct": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
}
}
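As a sanity check on the new pricing entries, request cost is just tokens multiplied by the per-token rates. A small hedged sketch using the `gpt-4-turbo` numbers added above:

```python
model_cost = {
    "gpt-4-turbo": {"input_cost_per_token": 0.00001, "output_cost_per_token": 0.00003},
}

def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    entry = model_cost[model]
    return (
        prompt_tokens * entry["input_cost_per_token"]
        + completion_tokens * entry["output_cost_per_token"]
    )

# 1,000 prompt tokens + 500 completion tokens -> $0.01 + $0.015 = $0.025
print(estimate_cost("gpt-4-turbo", prompt_tokens=1_000, completion_tokens=500))
```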

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/04eb0ce8764f86fe.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/a282d1bfd6ed4df8.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-68f14392aea51f63.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-68f14392aea51f63.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/04eb0ce8764f86fe.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46502,[\"253\",\"static/chunks/253-8ab6133ad5f92675.js\",\"931\",\"static/chunks/app/page-a485c9c659128852.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/04eb0ce8764f86fe.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"KnyD0lgLk9_a0erHwSSu-\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-11b043d6a7ef78fa.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-11b043d6a7ef78fa.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/a282d1bfd6ed4df8.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[29306,[\"823\",\"static/chunks/823-2ada48e2e6a5ab39.js\",\"931\",\"static/chunks/app/page-e16bcf8bdc356530.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/a282d1bfd6ed4df8.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"BNBzATtnAelV8BpmzRdfL\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[46502,["253","static/chunks/253-8ab6133ad5f92675.js","931","static/chunks/app/page-a485c9c659128852.js"],""]
3:I[29306,["823","static/chunks/823-2ada48e2e6a5ab39.js","931","static/chunks/app/page-e16bcf8bdc356530.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["KnyD0lgLk9_a0erHwSSu-",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/04eb0ce8764f86fe.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["BNBzATtnAelV8BpmzRdfL",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/a282d1bfd6ed4df8.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -5,6 +5,7 @@ model_list:
api_key: my-fake-key
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
stream_timeout: 0.001
rpm: 10
- litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
@ -12,30 +13,38 @@ model_list:
api_version: "2023-07-01-preview"
stream_timeout: 0.001
model_name: azure-gpt-3.5
# - model_name: text-embedding-ada-002
# litellm_params:
# model: text-embedding-ada-002
# api_key: os.environ/OPENAI_API_KEY
- model_name: gpt-instruct
litellm_params:
model: gpt-3.5-turbo-instruct
model: text-completion-openai/gpt-3.5-turbo-instruct
# api_key: my-fake-key
# api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
success_callback: ["prometheus"]
# litellm_settings:
# drop_params: True
# max_budget: 800021
# budget_duration: 30d
# # cache: true
upperbound_key_generate_params:
max_budget: os.environ/LITELLM_UPPERBOUND_KEYS_MAX_BUDGET
router_settings:
routing_strategy: usage-based-routing-v2
redis_host: os.environ/REDIS_HOST
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
enable_pre_call_checks: True
general_settings:
master_key: sk-1234
allow_user_auth: true
alerting: ["slack"]
store_model_in_db: True
# proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
# store_model_in_db: True // set via environment variable - os.environ["STORE_MODEL_IN_DB"] = "True"
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
enable_jwt_auth: True
alerting: ["slack"]
litellm_jwtauth:
admin_jwt_scope: "litellm_proxy_admin"
public_key_ttl: 600
public_key_ttl: os.environ/LITELLM_PUBLIC_KEY_TTL
user_id_jwt_field: "sub"
org_id_jwt_field: "azp"
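The `router_settings` above (usage-based-routing-v2 backed by Redis, plus a per-deployment `rpm`) can also be expressed directly on the Python `Router`. A hedged sketch with placeholder values; exact constructor kwargs may differ by litellm version:

```python
import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/fake",
                "api_key": "fake-key",
                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                "rpm": 10,  # per-deployment requests-per-minute, as in the config
            },
        }
    ],
    routing_strategy="usage-based-routing-v2",
    redis_host=os.getenv("REDIS_HOST"),
    redis_password=os.getenv("REDIS_PASSWORD"),
    redis_port=int(os.getenv("REDIS_PORT", "6379")),
    enable_pre_call_checks=True,
)
```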

View file

@ -38,6 +38,18 @@ class LiteLLMBase(BaseModel):
protected_namespaces = ()
class LiteLLM_UpperboundKeyGenerateParams(LiteLLMBase):
"""
Set default upper bounds for the params (e.g. max budget) a key created via `/key/generate` can have.
"""
max_budget: Optional[float] = None
budget_duration: Optional[str] = None
max_parallel_requests: Optional[int] = None
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
class LiteLLMRoutes(enum.Enum):
openai_routes: List = [ # chat completions
"/openai/deployments/{model}/chat/completions",
@ -112,7 +124,8 @@ class LiteLLM_JWTAuth(LiteLLMBase):
- team_jwt_scope: The JWT scope required for proxy team roles.
- team_id_jwt_field: The field in the JWT token that stores the team ID. Default - `client_id`.
- team_allowed_routes: list of allowed routes for proxy team roles.
- end_user_id_jwt_field: Default - `sub`. The field in the JWT token that stores the end-user ID. Turn this off by setting to `None`. Enables end-user cost tracking.
- user_id_jwt_field: The field in the JWT token that stores the user id (maps to `LiteLLM_UserTable`). Use this for internal employees.
- end_user_id_jwt_field: The field in the JWT token that stores the end-user ID (maps to `LiteLLMEndUserTable`). Turn this off by setting to `None`. Enables end-user cost tracking. Use this for external customers.
- public_key_ttl: Default - 600s. TTL for caching public JWT keys.
See `auth_checks.py` for the specific routes
@ -127,7 +140,9 @@ class LiteLLM_JWTAuth(LiteLLMBase):
team_allowed_routes: List[
Literal["openai_routes", "info_routes", "management_routes"]
] = ["openai_routes", "info_routes"]
end_user_id_jwt_field: Optional[str] = "sub"
org_id_jwt_field: Optional[str] = None
user_id_jwt_field: Optional[str] = None
end_user_id_jwt_field: Optional[str] = None
public_key_ttl: float = 600
def __init__(self, **kwargs: Any) -> None:
@ -363,6 +378,8 @@ class NewUserRequest(GenerateKeyRequest):
max_budget: Optional[float] = None
user_email: Optional[str] = None
user_role: Optional[str] = None
teams: Optional[list] = None
organization_id: Optional[str] = None
auto_create_key: bool = (
True # flag used for returning a key as part of the /user/new response
)
@ -498,6 +515,7 @@ class LiteLLM_BudgetTable(LiteLLMBase):
class NewOrganizationRequest(LiteLLM_BudgetTable):
organization_id: Optional[str] = None
organization_alias: str
models: List = []
budget_id: Optional[str] = None
@ -506,6 +524,7 @@ class NewOrganizationRequest(LiteLLM_BudgetTable):
class LiteLLM_OrganizationTable(LiteLLMBase):
"""Represents user-controllable params for a LiteLLM_OrganizationTable record"""
organization_id: Optional[str] = None
organization_alias: Optional[str] = None
budget_id: str
metadata: Optional[dict] = None
@ -690,6 +709,8 @@ class LiteLLM_VerificationToken(LiteLLMBase):
soft_budget_cooldown: bool = False
litellm_budget_table: Optional[dict] = None
org_id: Optional[str] = None # org id for a given key
# hidden params used for parallel request limiting, not required to create a token
user_id_rate_limits: Optional[dict] = None
team_id_rate_limits: Optional[dict] = None
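`LiteLLM_UpperboundKeyGenerateParams` is just a container of caps; enforcement amounts to clamping whatever a `/key/generate` request asks for. A hedged sketch of that idea, not the proxy's actual enforcement code (`clamp_request` is a hypothetical helper):

```python
from typing import Optional
from pydantic import BaseModel

class UpperboundParams(BaseModel):
    max_budget: Optional[float] = None
    tpm_limit: Optional[int] = None
    rpm_limit: Optional[int] = None

def clamp_request(requested: dict, bounds: UpperboundParams) -> dict:
    clamped = dict(requested)
    for field, bound in bounds.dict(exclude_none=True).items():
        if clamped.get(field) is None or clamped[field] > bound:
            clamped[field] = bound  # never exceed the configured upper bound
    return clamped

bounds = UpperboundParams(max_budget=100.0, rpm_limit=1000)
print(clamp_request({"max_budget": 500.0, "rpm_limit": 50}, bounds))
# -> {'max_budget': 100.0, 'rpm_limit': 50}
```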

View file

@ -14,6 +14,7 @@ from litellm.proxy._types import (
LiteLLM_JWTAuth,
LiteLLM_TeamTable,
LiteLLMRoutes,
LiteLLM_OrganizationTable,
)
from typing import Optional, Literal, Union
from litellm.proxy.utils import PrismaClient
@ -26,6 +27,7 @@ all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes
def common_checks(
request_body: dict,
team_object: LiteLLM_TeamTable,
user_object: Optional[LiteLLM_UserTable],
end_user_object: Optional[LiteLLM_EndUserTable],
global_proxy_spend: Optional[float],
general_settings: dict,
@ -37,7 +39,8 @@ def common_checks(
1. If team is blocked
2. If team can call model
3. If team is in budget
4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
5. If user passed in (JWT or key.user_id) - is in budget
4. If end_user (either via JWT or 'user' passed to /chat/completions, /embeddings endpoint) is in budget
5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
"""
@ -69,14 +72,20 @@ def common_checks(
raise Exception(
f"Team={team_object.team_id} over budget. Spend={team_object.spend}, Budget={team_object.max_budget}"
)
# 4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
if user_object is not None and user_object.max_budget is not None:
user_budget = user_object.max_budget
if user_object.spend > user_budget:
raise Exception(
f"ExceededBudget: User={user_object.user_id} over budget. Spend={user_object.spend}, Budget={user_budget}"
)
# 5. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
if end_user_object is not None and end_user_object.litellm_budget_table is not None:
end_user_budget = end_user_object.litellm_budget_table.max_budget
if end_user_budget is not None and end_user_object.spend > end_user_budget:
raise Exception(
f"ExceededBudget: End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
)
# 5. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
# 6. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
if (
general_settings.get("enforce_user_param", None) is not None
and general_settings["enforce_user_param"] == True
@ -85,7 +94,7 @@ def common_checks(
raise Exception(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
# 6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
if litellm.max_budget > 0 and global_proxy_spend is not None:
if global_proxy_spend > litellm.max_budget:
raise Exception(
@ -204,19 +213,24 @@ async def get_end_user_object(
return None
async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
async def get_user_object(
user_id: str,
prisma_client: Optional[PrismaClient],
user_api_key_cache: DualCache,
) -> Optional[LiteLLM_UserTable]:
"""
- Check if user id in proxy User Table
- if valid, return LiteLLM_UserTable object with defined limits
- if not, then raise an error
"""
if self.prisma_client is None:
raise Exception(
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
)
if prisma_client is None:
raise Exception("No db connected")
if user_id is None:
return None
# check if in cache
cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
cached_user_obj = await user_api_key_cache.async_get_cache(key=user_id)
if cached_user_obj is not None:
if isinstance(cached_user_obj, dict):
return LiteLLM_UserTable(**cached_user_obj)
@ -224,7 +238,7 @@ async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
return cached_user_obj
# else, check db
try:
response = await self.prisma_client.db.litellm_usertable.find_unique(
response = await prisma_client.db.litellm_usertable.find_unique(
where={"user_id": user_id}
)
@ -232,9 +246,9 @@ async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
raise Exception
return LiteLLM_UserTable(**response.dict())
except Exception as e:
except Exception as e: # if user not in db
raise Exception(
f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
f"User doesn't exist in db. 'user_id'={user_id}. Create user via `/user/new` call."
)
@ -274,3 +288,41 @@ async def get_team_object(
raise Exception(
f"Team doesn't exist in db. Team={team_id}. Create team via `/team/new` call."
)
async def get_org_object(
org_id: str,
prisma_client: Optional[PrismaClient],
user_api_key_cache: DualCache,
):
"""
- Check if org id in proxy Org Table
- if valid, return LiteLLM_OrganizationTable object
- if not, then raise an error
"""
if prisma_client is None:
raise Exception(
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
)
# check if in cache
cached_org_obj = user_api_key_cache.async_get_cache(key="org_id:{}".format(org_id))
if cached_org_obj is not None:
if isinstance(cached_org_obj, dict):
return cached_org_obj
elif isinstance(cached_org_obj, LiteLLM_OrganizationTable):
return cached_org_obj
# else, check db
try:
response = await prisma_client.db.litellm_organizationtable.find_unique(
where={"organization_id": org_id}
)
if response is None:
raise Exception
return response
except Exception as e:
raise Exception(
f"Organization doesn't exist in db. Organization={org_id}. Create organization via `/organization/new` call."
)
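`get_user_object` and `get_org_object` follow the same cache-then-DB shape. A stripped-down, hedged sketch of that flow using a plain dict in place of `DualCache` and Prisma:

```python
import asyncio

_cache: dict = {}
_fake_db = {"user-123": {"user_id": "user-123", "max_budget": 10.0, "spend": 0.0}}

async def get_user(user_id: str) -> dict:
    # 1. check the cache first
    cached = _cache.get(user_id)
    if cached is not None:
        return cached
    # 2. fall back to the database, then populate the cache
    row = _fake_db.get(user_id)
    if row is None:
        raise Exception(f"User doesn't exist in db. 'user_id'={user_id}. Create user via `/user/new` call.")
    _cache[user_id] = row
    return row

print(asyncio.run(get_user("user-123")))
```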

View file

@ -74,6 +74,26 @@ class JWTHandler:
team_id = default_value
return team_id
def get_user_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
try:
if self.litellm_jwtauth.user_id_jwt_field is not None:
user_id = token[self.litellm_jwtauth.user_id_jwt_field]
else:
user_id = None
except KeyError:
user_id = default_value
return user_id
def get_org_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
try:
if self.litellm_jwtauth.org_id_jwt_field is not None:
org_id = token[self.litellm_jwtauth.org_id_jwt_field]
else:
org_id = None
except KeyError:
org_id = default_value
return org_id
def get_scopes(self, token: dict) -> list:
try:
if isinstance(token["scope"], str):
@ -101,7 +121,11 @@ class JWTHandler:
if cached_keys is None:
response = await self.http_handler.get(keys_url)
keys = response.json()["keys"]
response_json = response.json()
if "keys" in response_json:
keys = response.json()["keys"]
else:
keys = response_json
await self.user_api_key_cache.async_set_cache(
key="litellm_jwt_auth_keys",

View file

@ -79,7 +79,7 @@ class _PROXY_BatchRedisRequests(CustomLogger):
self.print_verbose(f"redis keys: {keys}")
if len(keys) > 0:
key_value_dict = (
await litellm.cache.cache.async_get_cache_pipeline(
await litellm.cache.cache.async_batch_get_cache(
key_list=keys
)
)

View file

@ -425,9 +425,10 @@ def run_server(
)
proxy_config = ProxyConfig()
_, _, general_settings = asyncio.run(
proxy_config.load_config(router=None, config_file_path=config)
)
_config = asyncio.run(proxy_config.get_config(config_file_path=config))
general_settings = _config.get("general_settings", {})
if general_settings is None:
general_settings = {}
database_url = general_settings.get("database_url", None)
db_connection_pool_limit = general_settings.get(
"database_connection_pool_limit", 100

View file

@ -1,49 +1,9 @@
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: gpt-3.5-turbo-large
litellm_params:
model: "gpt-3.5-turbo-1106"
api_key: os.environ/OPENAI_API_KEY
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: sagemaker-completion-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_second: 0.000420
- model_name: text-embedding-ada-002
litellm_params:
model: azure/azure-embedding-model
api_key: os.environ/AZURE_API_KEY
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
model_info:
mode: embedding
base_model: text-embedding-ada-002
- model_name: dall-e-2
litellm_params:
model: azure/
api_version: 2023-06-01-preview
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_key: os.environ/AZURE_API_KEY
- model_name: openai-dall-e-3
litellm_params:
model: dall-e-3
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
success_callback: ["prometheus"]
general_settings:
store_model_in_db: true
master_key: sk-1234

File diff suppressed because it is too large Load diff

View file

@ -53,6 +53,7 @@ model LiteLLM_OrganizationTable {
updated_by String
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
teams LiteLLM_TeamTable[]
users LiteLLM_UserTable[]
}
// Model info for teams, just has model aliases for now.
@ -99,6 +100,7 @@ model LiteLLM_UserTable {
user_id String @id
user_alias String?
team_id String?
organization_id String?
teams String[] @default([])
user_role String?
max_budget Float?
@ -113,6 +115,7 @@ model LiteLLM_UserTable {
allowed_cache_controls String[] @default([])
model_spend Json @default("{}")
model_max_budget Json @default("{}")
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
}
// Generate Tokens for Proxy

View file

@ -0,0 +1,126 @@
import openai
import asyncio
async def async_request(client, model, input_data):
response = await client.embeddings.create(model=model, input=input_data)
response = response.dict()
data_list = response["data"]
for i, embedding in enumerate(data_list):
embedding["embedding"] = []
current_index = embedding["index"]
assert i == current_index
return response
async def main():
client = openai.AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
models = [
"text-embedding-ada-002",
"text-embedding-ada-002",
"text-embedding-ada-002",
]
inputs = [
[
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
],
["1", "2", "3", "4", "5", "6"],
[
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
],
[
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
],
[
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
],
["1", "2", "3"],
]
tasks = []
for model, input_data in zip(models, inputs):
task = async_request(client, model, input_data)
tasks.append(task)
responses = await asyncio.gather(*tasks)
print(responses)
for response in responses:
data_list = response["data"]
for embedding in data_list:
embedding["embedding"] = []
print(response)
asyncio.run(main())

View file

@ -0,0 +1,10 @@
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
# request sent to model set on litellm proxy, `litellm --model`
response = client.embeddings.create(
model="text-embedding-ada-002", input=["test"], encoding_format="base64"
)
print(response)

View file

@ -461,7 +461,12 @@ class ProxyLogging:
"""
### ALERTING ###
if isinstance(original_exception, HTTPException):
error_message = original_exception.detail
if isinstance(original_exception.detail, str):
error_message = original_exception.detail
elif isinstance(original_exception.detail, dict):
error_message = json.dumps(original_exception.detail)
else:
error_message = str(original_exception)
else:
error_message = str(original_exception)
if isinstance(traceback_str, str):
@ -562,6 +567,7 @@ class PrismaClient:
end_user_list_transactons: dict = {}
key_list_transactons: dict = {}
team_list_transactons: dict = {}
org_list_transactons: dict = {}
spend_log_transactions: List = []
def __init__(self, database_url: str, proxy_logging_obj: ProxyLogging):
@ -1159,13 +1165,26 @@ class PrismaClient:
return new_verification_token
elif table_name == "user":
db_data = self.jsonify_object(data=data)
new_user_row = await self.db.litellm_usertable.upsert(
where={"user_id": data["user_id"]},
data={
"create": {**db_data}, # type: ignore
"update": {}, # don't do anything if it already exists
},
)
try:
new_user_row = await self.db.litellm_usertable.upsert(
where={"user_id": data["user_id"]},
data={
"create": {**db_data}, # type: ignore
"update": {}, # don't do anything if it already exists
},
)
except Exception as e:
if (
"Foreign key constraint failed on the field: `LiteLLM_UserTable_organization_id_fkey (index)`"
in str(e)
):
raise HTTPException(
status_code=400,
detail={
"error": f"Foreign Key Constraint failed. Organization ID={db_data['organization_id']} does not exist in LiteLLM_OrganizationTable. Create via `/organization/new`."
},
)
raise e
verbose_proxy_logger.info("Data Inserted into User Table")
return new_user_row
elif table_name == "team":
@ -2132,6 +2151,46 @@ async def update_spend(
)
raise e
### UPDATE ORG TABLE ###
if len(prisma_client.org_list_transactons.keys()) > 0:
for i in range(n_retry_times + 1):
try:
async with prisma_client.db.tx(
timeout=timedelta(seconds=60)
) as transaction:
async with transaction.batch_() as batcher:
for (
org_id,
response_cost,
) in prisma_client.org_list_transactons.items():
batcher.litellm_organizationtable.update_many( # 'update_many' prevents error from being raised if no row exists
where={"organization_id": org_id},
data={"spend": {"increment": response_cost}},
)
prisma_client.org_list_transactons = (
{}
) # Clear the remaining transactions after processing all batches in the loop.
break
except httpx.ReadTimeout:
if i >= n_retry_times: # If we've reached the maximum number of retries
raise # Re-raise the last exception
# Optionally, sleep for a bit before retrying
await asyncio.sleep(2**i) # Exponential backoff
except Exception as e:
import traceback
error_msg = (
f"LiteLLM Prisma Client Exception - update org spend: {str(e)}"
)
print_verbose(error_msg)
error_traceback = error_msg + "\n" + traceback.format_exc()
asyncio.create_task(
proxy_logging_obj.failure_handler(
original_exception=e, traceback_str=error_traceback
)
)
raise e
### UPDATE SPEND LOGS ###
verbose_proxy_logger.debug(
"Spend Logs transactions: {}".format(len(prisma_client.spend_log_transactions))

View file

@ -11,9 +11,9 @@ import copy, httpx
from datetime import datetime
from typing import Dict, List, Optional, Union, Literal, Any, BinaryIO
import random, threading, time, traceback, uuid
import litellm, openai
import litellm, openai, hashlib, json
from litellm.caching import RedisCache, InMemoryCache, DualCache
import datetime as datetime_og
import logging, asyncio
import inspect, concurrent
from openai import AsyncOpenAI
@ -21,15 +21,16 @@ from collections import defaultdict
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2
from litellm.llms.custom_httpx.azure_dall_e_2 import (
CustomHTTPTransport,
AsyncCustomHTTPTransport,
)
from litellm.utils import ModelResponse, CustomStreamWrapper
from litellm.utils import ModelResponse, CustomStreamWrapper, get_utc_datetime
import copy
from litellm._logging import verbose_router_logger
import logging
from litellm.types.router import Deployment, ModelInfo, LiteLLM_Params
from litellm.types.router import Deployment, ModelInfo, LiteLLM_Params, RouterErrors
class Router:
@ -77,6 +78,7 @@ class Router:
"latency-based-routing",
] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based routing
semaphore: Optional[asyncio.Semaphore] = None,
) -> None:
"""
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
@ -142,6 +144,8 @@ class Router:
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
```
"""
if semaphore:
self.semaphore = semaphore
self.set_verbose = set_verbose
self.debug_level = debug_level
self.enable_pre_call_checks = enable_pre_call_checks
@ -273,6 +277,12 @@ class Router:
)
if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowesttpm_logger) # type: ignore
elif routing_strategy == "usage-based-routing-v2":
self.lowesttpm_logger_v2 = LowestTPMLoggingHandler_v2(
router_cache=self.cache, model_list=self.model_list
)
if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowesttpm_logger_v2) # type: ignore
elif routing_strategy == "latency-based-routing":
self.lowestlatency_logger = LowestLatencyLoggingHandler(
router_cache=self.cache,
@ -402,12 +412,19 @@ class Router:
raise e
async def _acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
"""
- Get an available deployment
- call it with a semaphore over the call
- semaphore specific to its rpm
- in the semaphore, make a check against its local rpm before running
"""
model_name = None
try:
verbose_router_logger.debug(
f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
deployment = await self.async_get_available_deployment(
model=model,
messages=messages,
specific_deployment=kwargs.pop("specific_deployment", None),
@ -436,6 +453,7 @@ class Router:
potential_model_client = self._get_client(
deployment=deployment, kwargs=kwargs, client_type="async"
)
# check if provided keys == client keys #
dynamic_api_key = kwargs.get("api_key", None)
if (
@ -458,7 +476,7 @@ class Router:
) # this uses default_litellm_params when nothing is set
)
response = await litellm.acompletion(
_response = litellm.acompletion(
**{
**data,
"messages": messages,
@ -468,6 +486,25 @@ class Router:
**kwargs,
}
)
rpm_semaphore = self._get_client(
deployment=deployment, kwargs=kwargs, client_type="rpm_client"
)
if (
rpm_semaphore is not None
and isinstance(rpm_semaphore, asyncio.Semaphore)
and self.routing_strategy == "usage-based-routing-v2"
):
async with rpm_semaphore:
"""
- Check rpm limits before making the call
"""
await self.lowesttpm_logger_v2.pre_call_rpm_check(deployment)
response = await _response
else:
response = await _response
self.success_calls[model_name] += 1
verbose_router_logger.info(
f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
@ -581,7 +618,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
specific_deployment=kwargs.pop("specific_deployment", None),
@ -681,7 +718,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _atranscription()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
specific_deployment=kwargs.pop("specific_deployment", None),
@ -761,7 +798,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _moderation()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
deployment = await self.async_get_available_deployment(
model=model,
input=input,
specific_deployment=kwargs.pop("specific_deployment", None),
@ -904,7 +941,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": prompt}],
specific_deployment=kwargs.pop("specific_deployment", None),
@ -1070,7 +1107,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _aembedding()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
deployment = await self.async_get_available_deployment(
model=model,
input=input,
specific_deployment=kwargs.pop("specific_deployment", None),
@ -1258,6 +1295,8 @@ class Router:
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
elif RouterErrors.user_defined_ratelimit_error.value in str(e):
raise e # don't wait to retry if deployment hits user-defined rate-limit
elif hasattr(original_exception, "status_code") and litellm._should_retry(
status_code=original_exception.status_code
):
@ -1598,7 +1637,8 @@ class Router:
if deployment is None:
return
current_minute = datetime.now().strftime("%H-%M")
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
# get current fails for deployment
# update the number of failed calls
# if it's > allowed fails
@ -1636,11 +1676,29 @@ class Router:
key=deployment, value=updated_fails, ttl=cooldown_time
)
async def _async_get_cooldown_deployments(self):
"""
Async implementation of '_get_cooldown_deployments'
"""
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models"
# ----------------------
# Return cooldown models
# ----------------------
cooldown_models = await self.cache.async_get_cache(key=cooldown_key) or []
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
return cooldown_models
def _get_cooldown_deployments(self):
"""
Get the list of models being cooled down for this minute
"""
current_minute = datetime.now().strftime("%H-%M")
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models"
@ -1654,12 +1712,26 @@ class Router:
def set_client(self, model: dict):
"""
Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
- Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
- Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
"""
client_ttl = self.client_ttl
litellm_params = model.get("litellm_params", {})
model_name = litellm_params.get("model")
model_id = model["model_info"]["id"]
# ### IF RPM SET - initialize a semaphore ###
rpm = litellm_params.get("rpm", None)
if rpm:
semaphore = asyncio.Semaphore(rpm)
cache_key = f"{model_id}_rpm_client"
self.cache.set_cache(
key=cache_key,
value=semaphore,
local_only=True,
)
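# note: the semaphore is cached with local_only=True since an asyncio.Semaphore cannot be
# shared via Redis - each router instance keeps its own copy of the per-deployment semaphore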
# print("STORES SEMAPHORE IN CACHE")
#### for OpenAI / Azure we need to initialize the Client for High Traffic ########
custom_llm_provider = litellm_params.get("custom_llm_provider")
custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
@ -1874,8 +1946,12 @@ class Router:
local_only=True,
) # cache for 1 hr
else:
_api_key = api_key
if _api_key is not None and isinstance(_api_key, str):
# only show first 8 chars of api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{api_key}"
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
)
azure_client_params = {
"api_key": api_key,
@ -1972,8 +2048,12 @@ class Router:
) # cache for 1 hr
else:
_api_key = api_key
if _api_key is not None and isinstance(_api_key, str):
# only show first 8 chars of api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{api_key}"
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncOpenAI( # type: ignore
@ -2065,6 +2145,34 @@ class Router:
local_only=True,
) # cache for 1 hr
def _generate_model_id(self, model_group: str, litellm_params: dict):
"""
Helper function to consistently generate the same id for a deployment
- create a string from all the litellm params
- hash
- use hash as id
"""
concat_str = model_group
for k, v in litellm_params.items():
if isinstance(k, str):
concat_str += k
elif isinstance(k, dict):
concat_str += json.dumps(k)
else:
concat_str += str(k)
if isinstance(v, str):
concat_str += v
elif isinstance(v, dict):
concat_str += json.dumps(v)
else:
concat_str += str(v)
hash_object = hashlib.sha256(concat_str.encode())
return hash_object.hexdigest()
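# illustrative property (mirrors test_consistent_model_id): identical model_group + litellm_params
# always hash to the same deployment id, so ids stay stable across restarts and router instances, e.g.
#   assert Router()._generate_model_id("gpt-3.5-turbo", {"model": "openai/my-fake-model"}) == \
#          Router()._generate_model_id("gpt-3.5-turbo", {"model": "openai/my-fake-model"})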
def set_model_list(self, model_list: list):
original_model_list = copy.deepcopy(model_list)
self.model_list = []
@ -2080,7 +2188,13 @@ class Router:
if isinstance(v, str) and v.startswith("os.environ/"):
_litellm_params[k] = litellm.get_secret(v)
_model_info = model.pop("model_info", {})
_model_info: dict = model.pop("model_info", {})
# check if model info has id
if "id" not in _model_info:
_id = self._generate_model_id(_model_name, _litellm_params)
_model_info["id"] = _id
deployment = Deployment(
**model,
model_name=_model_name,
@ -2207,7 +2321,11 @@ class Router:
The appropriate client based on the given client_type and kwargs.
"""
model_id = deployment["model_info"]["id"]
if client_type == "async":
if client_type == "rpm_client":
cache_key = "{}_rpm_client".format(model_id)
client = self.cache.get_cache(key=cache_key, local_only=True)
return client
elif client_type == "async":
if kwargs.get("stream") == True:
cache_key = f"{model_id}_stream_async_client"
client = self.cache.get_cache(key=cache_key, local_only=True)
@ -2260,6 +2378,7 @@ class Router:
Filter out model in model group, if:
- model context window < message length
- filter models above rpm limits
- [TODO] function call and model doesn't support function calling
"""
verbose_router_logger.debug(
@ -2279,11 +2398,12 @@ class Router:
_rate_limit_error = False
## get model group RPM ##
current_minute = datetime.now().strftime("%H-%M")
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
rpm_key = f"{model}:rpm:{current_minute}"
model_group_cache = (
self.cache.get_cache(key=rpm_key, local_only=True) or {}
) # check the redis + in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
) # check the in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
for idx, deployment in enumerate(_returned_deployments):
# see if we have the info for this model
try:
@ -2296,20 +2416,20 @@ class Router:
"model", None
)
model_info = litellm.get_model_info(model=model)
except:
continue
if (
isinstance(model_info, dict)
and model_info.get("max_input_tokens", None) is not None
):
if (
isinstance(model_info["max_input_tokens"], int)
and input_tokens > model_info["max_input_tokens"]
isinstance(model_info, dict)
and model_info.get("max_input_tokens", None) is not None
):
invalid_model_indices.append(idx)
_context_window_error = True
continue
if (
isinstance(model_info["max_input_tokens"], int)
and input_tokens > model_info["max_input_tokens"]
):
invalid_model_indices.append(idx)
_context_window_error = True
continue
except Exception as e:
verbose_router_logger.debug("An error occurred - {}".format(str(e)))
## RPM CHECK ##
_litellm_params = deployment.get("litellm_params", {})
@ -2319,23 +2439,24 @@ class Router:
self.cache.get_cache(key=model_id, local_only=True) or 0
)
### get usage based cache ###
model_group_cache[model_id] = model_group_cache.get(model_id, 0)
if isinstance(model_group_cache, dict):
model_group_cache[model_id] = model_group_cache.get(model_id, 0)
current_request = max(
current_request_cache_local, model_group_cache[model_id]
)
current_request = max(
current_request_cache_local, model_group_cache[model_id]
)
if (
isinstance(_litellm_params, dict)
and _litellm_params.get("rpm", None) is not None
):
if (
isinstance(_litellm_params["rpm"], int)
and _litellm_params["rpm"] <= current_request
isinstance(_litellm_params, dict)
and _litellm_params.get("rpm", None) is not None
):
invalid_model_indices.append(idx)
_rate_limit_error = True
continue
if (
isinstance(_litellm_params["rpm"], int)
and _litellm_params["rpm"] <= current_request
):
invalid_model_indices.append(idx)
_rate_limit_error = True
continue
if len(invalid_model_indices) == len(_returned_deployments):
"""
@ -2364,7 +2485,7 @@ class Router:
return _returned_deployments
def get_available_deployment(
def _common_checks_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
@ -2372,11 +2493,11 @@ class Router:
specific_deployment: Optional[bool] = False,
):
"""
Returns the deployment based on routing strategy
"""
Common checks for 'get_available_deployment' across sync + async call.
# users need to explicitly call a specific deployment, by setting `specific_deployment = True` as completion()/embedding() kwarg
# When this was not explicit we had several issues with fallbacks timing out
If 'healthy_deployments' returned is None, this means the user chose a specific deployment
"""
# check if aliases set on litellm model alias map
if specific_deployment == True:
# users can also specify a specific deployment name. At this point we should check if they are just trying to call a specific deployment
for deployment in self.model_list:
@ -2384,12 +2505,11 @@ class Router:
if deployment_model == model:
# User Passed a specific deployment name on their config.yaml, example azure/chat-gpt-v-2
# return the first deployment where the `model` matches the specificed deployment name
return deployment
return deployment, None
raise ValueError(
f"LiteLLM Router: Trying to call specific deployment, but Model:{model} does not exist in Model List: {self.model_list}"
)
# check if aliases set on litellm model alias map
if model in self.model_group_alias:
verbose_router_logger.debug(
f"Using a model alias. Got Request for {model}, sending requests to {self.model_group_alias.get(model)}"
@ -2401,7 +2521,7 @@ class Router:
self.default_deployment
) # self.default_deployment
updated_deployment["litellm_params"]["model"] = model
return updated_deployment
return updated_deployment, None
## get healthy deployments
### get all deployments
@ -2416,6 +2536,118 @@ class Router:
f"initial list of deployments: {healthy_deployments}"
)
verbose_router_logger.debug(
f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
)
if len(healthy_deployments) == 0:
raise ValueError(f"No healthy deployment available, passed model={model}")
if litellm.model_alias_map and model in litellm.model_alias_map:
model = litellm.model_alias_map[
model
] # update the model to the actual value if an alias has been passed in
return model, healthy_deployments
async def async_get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
):
"""
Async implementation of 'get_available_deployments'.
Allows all cache calls to be made async => 10x perf impact (8rps -> 100 rps).
"""
if (
self.routing_strategy != "usage-based-routing-v2"
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
return self.get_available_deployment(
model=model,
messages=messages,
input=input,
specific_deployment=specific_deployment,
)
model, healthy_deployments = self._common_checks_available_deployment(
model=model,
messages=messages,
input=input,
specific_deployment=specific_deployment,
)
if healthy_deployments is None:
return model
# filter out the deployments currently cooling down
deployments_to_remove = []
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
cooldown_deployments = await self._async_get_cooldown_deployments()
verbose_router_logger.debug(
f"async cooldown deployments: {cooldown_deployments}"
)
# Find deployments in model_list whose model_id is cooling down
for deployment in healthy_deployments:
deployment_id = deployment["model_info"]["id"]
if deployment_id in cooldown_deployments:
deployments_to_remove.append(deployment)
# remove unhealthy deployments from healthy deployments
for deployment in deployments_to_remove:
healthy_deployments.remove(deployment)
# filter pre-call checks
if self.enable_pre_call_checks and messages is not None:
healthy_deployments = self._pre_call_checks(
model=model, healthy_deployments=healthy_deployments, messages=messages
)
if (
self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None
):
deployment = await self.lowesttpm_logger_v2.async_get_available_deployments(
model_group=model,
healthy_deployments=healthy_deployments,
messages=messages,
input=input,
)
if deployment is None:
verbose_router_logger.info(
f"get_available_deployment for model: {model}, No deployment available"
)
raise ValueError(
f"No deployments available for selected model, passed model={model}"
)
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
)
return deployment
def get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
):
"""
Returns the deployment based on routing strategy
"""
# users need to explicitly call a specific deployment, by setting `specific_deployment = True` as completion()/embedding() kwarg
# When this was not explicit we had several issues with fallbacks timing out
model, healthy_deployments = self._common_checks_available_deployment(
model=model,
messages=messages,
input=input,
specific_deployment=specific_deployment,
)
if healthy_deployments is None:
return model
# filter out the deployments currently cooling down
deployments_to_remove = []
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
@ -2436,16 +2668,6 @@ class Router:
model=model, healthy_deployments=healthy_deployments, messages=messages
)
verbose_router_logger.debug(
f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
)
if len(healthy_deployments) == 0:
raise ValueError(f"No healthy deployment available, passed model={model}")
if litellm.model_alias_map and model in litellm.model_alias_map:
model = litellm.model_alias_map[
model
] # update the model to the actual value if an alias has been passed in
if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
deployment = self.leastbusy_logger.get_available_deployments(
model_group=model, healthy_deployments=healthy_deployments
@ -2507,7 +2729,16 @@ class Router:
messages=messages,
input=input,
)
elif (
self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None
):
deployment = self.lowesttpm_logger_v2.get_available_deployments(
model_group=model,
healthy_deployments=healthy_deployments,
messages=messages,
input=input,
)
if deployment is None:
verbose_router_logger.info(
f"get_available_deployment for model: {model}, No deployment available"


@ -0,0 +1,403 @@
#### What this does ####
# identifies lowest tpm deployment
import dotenv, os, requests, random
from typing import Optional, Union, List, Dict
import datetime as datetime_og
from datetime import datetime
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback, asyncio, httpx
import litellm
from litellm import token_counter
from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_router_logger
from litellm.utils import print_verbose, get_utc_datetime
from litellm.types.router import RouterErrors
class LowestTPMLoggingHandler_v2(CustomLogger):
"""
Updated version of TPM/RPM Logging.
Meant to work across instances.
Caches individual models, not model_groups
Uses batch get (redis.mget)
Increments tpm/rpm limit using redis.incr
"""
test_flag: bool = False
logged_success: int = 0
logged_failure: int = 0
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
def __init__(self, router_cache: DualCache, model_list: list):
self.router_cache = router_cache
self.model_list = model_list
async def pre_call_rpm_check(self, deployment: dict) -> dict:
"""
Pre-call check + update model rpm
- Used inside semaphore
- raise rate limit error if deployment over limit
Why? solves concurrency issue - https://github.com/BerriAI/litellm/issues/2994
Returns - deployment
Raises - RateLimitError if deployment over defined RPM limit
"""
try:
# ------------
# Setup values
# ------------
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
model_group = deployment.get("model_name", "")
rpm_key = f"{model_group}:rpm:{current_minute}"
local_result = await self.router_cache.async_get_cache(
key=rpm_key, local_only=True
) # check local result first
deployment_rpm = None
if deployment_rpm is None:
deployment_rpm = deployment.get("rpm")
if deployment_rpm is None:
deployment_rpm = deployment.get("litellm_params", {}).get("rpm")
if deployment_rpm is None:
deployment_rpm = deployment.get("model_info", {}).get("rpm")
if deployment_rpm is None:
deployment_rpm = float("inf")
if local_result is not None and local_result >= deployment_rpm:
raise litellm.RateLimitError(
message="Deployment over defined rpm limit={}. current usage={}".format(
deployment_rpm, local_result
),
llm_provider="",
model=deployment.get("litellm_params", {}).get("model"),
response=httpx.Response(
status_code=429,
content="{} rpm limit={}. current usage={}".format(
RouterErrors.user_defined_ratelimit_error.value,
deployment_rpm,
local_result,
),
request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
else:
# if local result below limit, check redis ## prevent unnecessary redis checks
result = await self.router_cache.async_increment_cache(
key=rpm_key, value=1
)
if result is not None and result > deployment_rpm:
raise litellm.RateLimitError(
message="Deployment over defined rpm limit={}. current usage={}".format(
deployment_rpm, result
),
llm_provider="",
model=deployment.get("litellm_params", {}).get("model"),
response=httpx.Response(
status_code=429,
content="{} rpm limit={}. current usage={}".format(
RouterErrors.user_defined_ratelimit_error.value,
deployment_rpm,
result,
),
request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return deployment
except Exception as e:
if isinstance(e, litellm.RateLimitError):
raise e
return deployment # don't fail calls if eg. redis fails to connect
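# expected call pattern (see Router._acompletion above): acquire the deployment's rpm semaphore,
# then `await ...pre_call_rpm_check(deployment)` before dispatching the request, so concurrent
# requests cannot race past a user-defined rpm limit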
def log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
"""
Update TPM/RPM usage on success
"""
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
total_tokens = response_obj["usage"]["total_tokens"]
# ------------
# Setup values
# ------------
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
tpm_key = f"{model_group}:tpm:{current_minute}"
rpm_key = f"{model_group}:rpm:{current_minute}"
# ------------
# Update usage
# ------------
## TPM
request_count_dict = self.router_cache.get_cache(key=tpm_key) or {}
request_count_dict[id] = request_count_dict.get(id, 0) + total_tokens
self.router_cache.set_cache(key=tpm_key, value=request_count_dict)
## RPM
request_count_dict = self.router_cache.get_cache(key=rpm_key) or {}
request_count_dict[id] = request_count_dict.get(id, 0) + 1
self.router_cache.set_cache(key=rpm_key, value=request_count_dict)
### TESTING ###
if self.test_flag:
self.logged_success += 1
except Exception as e:
traceback.print_exc()
pass
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
"""
Update TPM usage on success
"""
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
total_tokens = response_obj["usage"]["total_tokens"]
# ------------
# Setup values
# ------------
dt = get_utc_datetime()
current_minute = dt.strftime(
"%H-%M"
) # use the same timezone regardless of system clock
tpm_key = f"{id}:tpm:{current_minute}"
# ------------
# Update usage
# ------------
# update cache
## TPM
await self.router_cache.async_increment_cache(
key=tpm_key, value=total_tokens
)
### TESTING ###
if self.test_flag:
self.logged_success += 1
except Exception as e:
traceback.print_exc()
pass
def _common_checks_available_deployment(
self,
model_group: str,
healthy_deployments: list,
tpm_keys: list,
tpm_values: list,
rpm_keys: list,
rpm_values: list,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
):
"""
Common checks for get available deployment, across sync + async implementations
"""
tpm_dict = {} # {model_id: 1, ..}
for idx, key in enumerate(tpm_keys):
tpm_dict[tpm_keys[idx]] = tpm_values[idx]
rpm_dict = {} # {model_id: 1, ..}
for idx, key in enumerate(rpm_keys):
rpm_dict[rpm_keys[idx]] = rpm_values[idx]
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
verbose_router_logger.debug(f"input_tokens={input_tokens}")
# -----------------------
# Find lowest used model
# ----------------------
lowest_tpm = float("inf")
if tpm_dict is None: # base case - none of the deployments have been used
# initialize a tpm dict with {model_id: 0}
tpm_dict = {}
for deployment in healthy_deployments:
tpm_dict[deployment["model_info"]["id"]] = 0
else:
for d in healthy_deployments:
## if healthy deployment not yet used
if d["model_info"]["id"] not in tpm_dict:
tpm_dict[d["model_info"]["id"]] = 0
all_deployments = tpm_dict
deployment = None
for item, item_tpm in all_deployments.items():
## get the item from model list
_deployment = None
for m in healthy_deployments:
if item == m["model_info"]["id"]:
_deployment = m
if _deployment is None:
continue # skip to next one
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm")
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get("tpm")
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get("tpm")
if _deployment_tpm is None:
_deployment_tpm = float("inf")
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm")
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get("rpm")
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get("rpm")
if _deployment_rpm is None:
_deployment_rpm = float("inf")
if item_tpm + input_tokens > _deployment_tpm:
continue
elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm
):
continue
elif item_tpm < lowest_tpm:
lowest_tpm = item_tpm
deployment = _deployment
print_verbose("returning picked lowest tpm/rpm deployment.")
return deployment
async def async_get_available_deployments(
self,
model_group: str,
healthy_deployments: list,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
):
"""
Async implementation of get deployments.
Reduces time to retrieve the tpm/rpm values from cache
"""
# get list of potential deployments
verbose_router_logger.debug(
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
)
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
tpm_keys = []
rpm_keys = []
for m in healthy_deployments:
if isinstance(m, dict):
id = m.get("model_info", {}).get(
"id"
) # a deployment should always have an 'id'. this is set in router.py
tpm_key = "{}:tpm:{}".format(id, current_minute)
rpm_key = "{}:rpm:{}".format(id, current_minute)
tpm_keys.append(tpm_key)
rpm_keys.append(rpm_key)
tpm_values = await self.router_cache.async_batch_get_cache(
keys=tpm_keys
) # [1, 2, None, ..]
rpm_values = await self.router_cache.async_batch_get_cache(
keys=rpm_keys
) # [1, 2, None, ..]
return self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
tpm_values=tpm_values,
rpm_keys=rpm_keys,
rpm_values=rpm_values,
messages=messages,
input=input,
)
def get_available_deployments(
self,
model_group: str,
healthy_deployments: list,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
):
"""
Returns a deployment with the lowest TPM/RPM usage.
"""
# get list of potential deployments
verbose_router_logger.debug(
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
)
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
tpm_keys = []
rpm_keys = []
for m in healthy_deployments:
if isinstance(m, dict):
id = m.get("model_info", {}).get(
"id"
) # a deployment should always have an 'id'. this is set in router.py
tpm_key = "{}:tpm:{}".format(id, current_minute)
rpm_key = "{}:rpm:{}".format(id, current_minute)
tpm_keys.append(tpm_key)
rpm_keys.append(rpm_key)
tpm_values = self.router_cache.batch_get_cache(
keys=tpm_keys
) # [1, 2, None, ..]
rpm_values = self.router_cache.batch_get_cache(
keys=rpm_keys
) # [1, 2, None, ..]
return self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
tpm_values=tpm_values,
rpm_keys=rpm_keys,
rpm_values=rpm_values,
messages=messages,
input=input,
)
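For reference, a minimal sketch of enabling this handler from the Router side (it is attached automatically when `routing_strategy="usage-based-routing-v2"` is passed, per the router changes above). The deployment reuses the fake endpoint from the test config; the Redis settings are assumptions, only needed if usage should be shared across instances:

import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint-2",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
                "rpm": 1,  # per-deployment limit enforced by LowestTPMLoggingHandler_v2
            },
        }
    ],
    routing_strategy="usage-based-routing-v2",
    enable_pre_call_checks=True,
    redis_host=os.getenv("REDIS_HOST"),
    redis_password=os.getenv("REDIS_PASSWORD"),
    redis_port=int(os.getenv("REDIS_PORT", "6379")),
)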

File diff suppressed because it is too large


@ -345,6 +345,83 @@ async def test_embedding_caching_azure_individual_items():
assert embedding_val_2._hidden_params["cache_hit"] == True
@pytest.mark.asyncio
async def test_embedding_caching_azure_individual_items_reordered():
"""
Tests caching for individual items in an embedding list
- Cache an item
- call aembedding(..) with the item + 1 unique item
- compare to a 2nd aembedding (...) with 2 unique items
```
embedding_1 = ["hey how's it going", "I'm doing well"]
embedding_val_1 = embedding(...)
embedding_2 = ["hey how's it going", "I'm fine"]
embedding_val_2 = embedding(...)
assert embedding_val_1[0]["id"] == embedding_val_2[0]["id"]
```
"""
litellm.cache = Cache()
common_msg = f"{uuid.uuid4()}"
common_msg_2 = f"hey how's it going {uuid.uuid4()}"
embedding_1 = [common_msg_2, common_msg]
embedding_2 = [
common_msg,
f"I'm fine {uuid.uuid4()}",
]
embedding_val_1 = await aembedding(
model="azure/azure-embedding-model", input=embedding_1, caching=True
)
embedding_val_2 = await aembedding(
model="azure/azure-embedding-model", input=embedding_2, caching=True
)
print(f"embedding_val_2._hidden_params: {embedding_val_2._hidden_params}")
assert embedding_val_2._hidden_params["cache_hit"] == True
assert embedding_val_2.data[0]["embedding"] == embedding_val_1.data[1]["embedding"]
assert embedding_val_2.data[0]["index"] != embedding_val_1.data[1]["index"]
assert embedding_val_2.data[0]["index"] == 0
assert embedding_val_1.data[1]["index"] == 1
@pytest.mark.asyncio
async def test_embedding_caching_base_64():
""" """
litellm.cache = Cache(
type="redis",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
)
import uuid
inputs = [
f"{uuid.uuid4()} hello this is ishaan",
f"{uuid.uuid4()} hello this is ishaan again",
]
embedding_val_1 = await aembedding(
model="azure/azure-embedding-model",
input=inputs,
caching=True,
encoding_format="base64",
)
embedding_val_2 = await aembedding(
model="azure/azure-embedding-model",
input=inputs,
caching=True,
encoding_format="base64",
)
assert embedding_val_2._hidden_params["cache_hit"] == True
print(embedding_val_2)
print(embedding_val_1)
assert embedding_val_2.data[0]["embedding"] == embedding_val_1.data[0]["embedding"]
assert embedding_val_2.data[1]["embedding"] == embedding_val_1.data[1]["embedding"]
@pytest.mark.asyncio
async def test_redis_cache_basic():
"""
@ -630,6 +707,39 @@ async def test_redis_cache_acompletion_stream():
# test_redis_cache_acompletion_stream()
@pytest.mark.asyncio
async def test_redis_cache_atext_completion():
try:
litellm.set_verbose = True
prompt = f"write a one sentence poem about: {uuid.uuid4()}"
litellm.cache = Cache(
type="redis",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
supported_call_types=["atext_completion"],
)
print("test for caching, atext_completion")
response1 = await litellm.atext_completion(
model="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=40, temperature=1
)
await asyncio.sleep(0.5)
print("\n\n Response 1 content: ", response1, "\n\n")
response2 = await litellm.atext_completion(
model="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=40, temperature=1
)
print(response2)
assert response1.id == response2.id
except Exception as e:
print(f"{str(e)}\n\n{traceback.format_exc()}")
raise e
@pytest.mark.asyncio
async def test_redis_cache_acompletion_stream_bedrock():
import asyncio


@ -596,7 +596,7 @@ def test_completion_gpt4_vision():
def test_completion_azure_gpt4_vision():
# azure/gpt-4, vision takes 5 seconds to respond
# azure/gpt-4, vision takes 5-seconds to respond
try:
litellm.set_verbose = True
response = completion(
@ -975,6 +975,19 @@ def test_completion_text_openai():
pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio
async def test_completion_text_openai_async():
try:
# litellm.set_verbose =True
response = await litellm.acompletion(
model="gpt-3.5-turbo-instruct", messages=messages
)
print(response["choices"][0]["message"]["content"])
except Exception as e:
print(e)
pytest.fail(f"Error occurred: {e}")
def custom_callback(
kwargs, # kwargs to completion
completion_response, # response from completion
@ -1619,9 +1632,9 @@ def test_completion_replicate_vicuna():
def test_replicate_custom_prompt_dict():
litellm.set_verbose = True
model_name = "replicate/meta/llama-2-7b-chat"
model_name = "replicate/meta/llama-2-70b-chat"
litellm.register_prompt_template(
model="replicate/meta/llama-2-7b-chat",
model="replicate/meta/llama-2-70b-chat",
initial_prompt_value="You are a good assistant", # [OPTIONAL]
roles={
"system": {
@ -1639,16 +1652,24 @@ def test_replicate_custom_prompt_dict():
},
final_prompt_value="Now answer as best you can:", # [OPTIONAL]
)
response = completion(
model=model_name,
messages=[
{
"role": "user",
"content": "what is yc write 1 paragraph",
}
],
num_retries=3,
)
try:
response = completion(
model=model_name,
messages=[
{
"role": "user",
"content": "what is yc write 1 paragraph",
}
],
repetition_penalty=0.1,
num_retries=3,
)
except litellm.APIError as e:
pass
except litellm.APIConnectionError as e:
pass
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
print(f"response: {response}")
litellm.custom_prompt_dict = {} # reset


@ -345,3 +345,187 @@ async def test_team_token_output(prisma_client):
assert team_result.team_tpm_limit == 100
assert team_result.team_rpm_limit == 99
assert team_result.team_models == ["gpt-3.5-turbo", "gpt-4"]
@pytest.mark.asyncio
async def test_user_token_output(prisma_client):
"""
- If user required, check if it exists
- fail initial request (when user doesn't exist)
- create user
- retry -> it should pass now
"""
import jwt, json
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.backends import default_backend
from fastapi import Request
from starlette.datastructures import URL
from litellm.proxy.proxy_server import user_api_key_auth, new_team, new_user
from litellm.proxy._types import NewTeamRequest, UserAPIKeyAuth, NewUserRequest
import litellm
import uuid
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
await litellm.proxy.proxy_server.prisma_client.connect()
# Generate a private / public key pair using RSA algorithm
key = rsa.generate_private_key(
public_exponent=65537, key_size=2048, backend=default_backend()
)
# Get private key in PEM format
private_key = key.private_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PrivateFormat.PKCS8,
encryption_algorithm=serialization.NoEncryption(),
)
# Get public key in PEM format
public_key = key.public_key().public_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PublicFormat.SubjectPublicKeyInfo,
)
public_key_obj = serialization.load_pem_public_key(
public_key, backend=default_backend()
)
# Convert RSA public key object to JWK (JSON Web Key)
public_jwk = json.loads(jwt.algorithms.RSAAlgorithm.to_jwk(public_key_obj))
assert isinstance(public_jwk, dict)
# set cache
cache = DualCache()
await cache.async_set_cache(key="litellm_jwt_auth_keys", value=[public_jwk])
jwt_handler = JWTHandler()
jwt_handler.user_api_key_cache = cache
jwt_handler.litellm_jwtauth = LiteLLM_JWTAuth()
jwt_handler.litellm_jwtauth.user_id_jwt_field = "sub"
# VALID TOKEN
## GENERATE A TOKEN
# Assuming the current time is in UTC
expiration_time = int((datetime.utcnow() + timedelta(minutes=10)).timestamp())
team_id = f"team123_{uuid.uuid4()}"
user_id = f"user123_{uuid.uuid4()}"
payload = {
"sub": user_id,
"exp": expiration_time, # set the token to expire in 10 minutes
"scope": "litellm_team",
"client_id": team_id,
}
# Generate the JWT token
# But before, you should convert bytes to string
private_key_str = private_key.decode("utf-8")
## team token
token = jwt.encode(payload, private_key_str, algorithm="RS256")
## admin token
payload = {
"sub": user_id,
"exp": expiration_time, # set the token to expire in 10 minutes
"scope": "litellm_proxy_admin",
}
admin_token = jwt.encode(payload, private_key_str, algorithm="RS256")
## VERIFY IT WORKS
# verify token
response = await jwt_handler.auth_jwt(token=token)
## RUN IT THROUGH USER API KEY AUTH
"""
- 1. Initial call should fail -> team doesn't exist
- 2. Create team via admin token
- 3. 2nd call w/ same team -> call should fail -> user doesn't exist
- 4. Create user via admin token
- 5. 3rd call w/ same team, same user -> call should succeed
- 6. assert user api key auth format
"""
bearer_token = "Bearer " + token
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
## 1. INITIAL TEAM CALL - should fail
# use generated key to auth in
setattr(litellm.proxy.proxy_server, "general_settings", {"enable_jwt_auth": True})
setattr(litellm.proxy.proxy_server, "jwt_handler", jwt_handler)
try:
result = await user_api_key_auth(request=request, api_key=bearer_token)
pytest.fail("Team doesn't exist. This should fail")
except Exception as e:
pass
## 2. CREATE TEAM W/ ADMIN TOKEN - should succeed
try:
bearer_token = "Bearer " + admin_token
request._url = URL(url="/team/new")
result = await user_api_key_auth(request=request, api_key=bearer_token)
await new_team(
data=NewTeamRequest(
team_id=team_id,
tpm_limit=100,
rpm_limit=99,
models=["gpt-3.5-turbo", "gpt-4"],
),
user_api_key_dict=result,
)
except Exception as e:
pytest.fail(f"This should not fail - {str(e)}")
## 3. 2nd CALL W/ TEAM TOKEN - should fail
bearer_token = "Bearer " + token
request._url = URL(url="/chat/completions")
try:
team_result: UserAPIKeyAuth = await user_api_key_auth(
request=request, api_key=bearer_token
)
pytest.fail(f"User doesn't exist. this should fail")
except Exception as e:
pass
## 4. Create user
try:
bearer_token = "Bearer " + admin_token
request._url = URL(url="/team/new")
result = await user_api_key_auth(request=request, api_key=bearer_token)
await new_user(
data=NewUserRequest(
user_id=user_id,
),
)
except Exception as e:
pytest.fail(f"This should not fail - {str(e)}")
## 5. 3rd call w/ same team, same user -> call should succeed
bearer_token = "Bearer " + token
request._url = URL(url="/chat/completions")
try:
team_result: UserAPIKeyAuth = await user_api_key_auth(
request=request, api_key=bearer_token
)
except Exception as e:
pytest.fail(f"Team exists. This should not fail - {e}")
## 6. ASSERT USER_API_KEY_AUTH format (used for tpm/rpm limiting in parallel_request_limiter.py AND cost tracking)
assert team_result.team_tpm_limit == 100
assert team_result.team_rpm_limit == 99
assert team_result.team_models == ["gpt-3.5-turbo", "gpt-4"]
assert team_result.user_id == user_id


@ -66,6 +66,7 @@ from litellm.proxy._types import (
GenerateKeyRequest,
NewTeamRequest,
UserAPIKeyAuth,
LiteLLM_UpperboundKeyGenerateParams,
)
from litellm.proxy.utils import DBClient
from starlette.datastructures import URL
@ -1627,10 +1628,9 @@ async def test_upperbound_key_params(prisma_client):
"""
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
litellm.upperbound_key_generate_params = {
"max_budget": 0.001,
"budget_duration": "1m",
}
litellm.upperbound_key_generate_params = LiteLLM_UpperboundKeyGenerateParams(
max_budget=0.001, budget_duration="1m"
)
await litellm.proxy.proxy_server.prisma_client.connect()
try:
request = GenerateKeyRequest(
@ -1638,18 +1638,9 @@ async def test_upperbound_key_params(prisma_client):
budget_duration="30d",
)
key = await generate_key_fn(request)
generated_key = key.key
result = await info_key_fn(key=generated_key)
key_info = result["info"]
# assert it used the upper bound for max_budget, and budget_duration
assert key_info["max_budget"] == 0.001
assert key_info["budget_duration"] == "1m"
print(result)
# print(result)
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")
assert e.code == 400
def test_get_bearer_token():
@ -1686,6 +1677,28 @@ def test_get_bearer_token():
assert result == "sk-1234", f"Expected 'valid_token', got '{result}'"
def test_update_logs_with_spend_logs_url(prisma_client):
"""
Unit test for making sure spend logs list is still updated when url passed in
"""
from litellm.proxy.proxy_server import _set_spend_logs_payload
payload = {"startTime": datetime.now(), "endTime": datetime.now()}
_set_spend_logs_payload(payload=payload, prisma_client=prisma_client)
assert len(prisma_client.spend_log_transactions) > 0
prisma_client.spend_log_transactions = []
spend_logs_url = ""
payload = {"startTime": datetime.now(), "endTime": datetime.now()}
_set_spend_logs_payload(
payload=payload, spend_logs_url=spend_logs_url, prisma_client=prisma_client
)
assert len(prisma_client.spend_log_transactions) > 0
@pytest.mark.asyncio
async def test_user_api_key_auth(prisma_client):
from litellm.proxy.proxy_server import ProxyException


@ -111,7 +111,10 @@ def test_llm_guard_key_specific_mode():
api_key=_api_key,
)
should_proceed = llm_guard.should_proceed(user_api_key_dict=user_api_key_dict)
request_data = {}
should_proceed = llm_guard.should_proceed(
user_api_key_dict=user_api_key_dict, data=request_data
)
assert should_proceed == False
@ -120,6 +123,46 @@ def test_llm_guard_key_specific_mode():
api_key=_api_key, permissions={"enable_llm_guard_check": True}
)
should_proceed = llm_guard.should_proceed(user_api_key_dict=user_api_key_dict)
request_data = {}
should_proceed = llm_guard.should_proceed(
user_api_key_dict=user_api_key_dict, data=request_data
)
assert should_proceed == True
def test_llm_guard_request_specific_mode():
"""
Tests to see if llm guard 'request-specific' permissions work
"""
litellm.llm_guard_mode = "request-specific"
llm_guard = _ENTERPRISE_LLMGuard(mock_testing=True)
_api_key = "sk-12345"
# NOT ENABLED
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key,
)
request_data = {}
should_proceed = llm_guard.should_proceed(
user_api_key_dict=user_api_key_dict, data=request_data
)
assert should_proceed == False
# ENABLED
user_api_key_dict = UserAPIKeyAuth(
api_key=_api_key, permissions={"enable_llm_guard_check": True}
)
request_data = {"metadata": {"permissions": {"enable_llm_guard_check": True}}}
should_proceed = llm_guard.should_proceed(
user_api_key_dict=user_api_key_dict, data=request_data
)
assert should_proceed == True


@ -398,6 +398,40 @@ async def test_async_router_context_window_fallback():
pytest.fail(f"Got unexpected exception on router! - {str(e)}")
def test_router_rpm_pre_call_check():
"""
- for a given model not in model cost map
- with rpm set
- check if rpm check is run
"""
try:
model_list = [
{
"model_name": "fake-openai-endpoint", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "openai/my-fake-model",
"api_key": "my-fake-key",
"api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
"rpm": 0,
},
},
]
router = Router(model_list=model_list, set_verbose=True, enable_pre_call_checks=True, num_retries=0) # type: ignore
try:
router._pre_call_checks(
model="fake-openai-endpoint",
healthy_deployments=model_list,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
pytest.fail("Expected this to fail")
except:
pass
except Exception as e:
pytest.fail(f"Got unexpected exception on router! - {str(e)}")
def test_router_context_window_check_pre_call_check_in_group():
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
@ -932,6 +966,35 @@ def test_openai_completion_on_router():
# test_openai_completion_on_router()
def test_consistent_model_id():
"""
- For a given model group + litellm params, assert the model id is always the same
Test on `_generate_model_id`
Test on `set_model_list`
Test on `_add_deployment`
"""
model_group = "gpt-3.5-turbo"
litellm_params = {
"model": "openai/my-fake-model",
"api_key": "my-fake-key",
"api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
"stream_timeout": 0.001,
}
id1 = Router()._generate_model_id(
model_group=model_group, litellm_params=litellm_params
)
id2 = Router()._generate_model_id(
model_group=model_group, litellm_params=litellm_params
)
assert id1 == id2
def test_reading_keys_os_environ():
import openai


@ -831,22 +831,25 @@ def test_bedrock_claude_3_streaming():
pytest.fail(f"Error occurred: {e}")
def test_claude_3_streaming_finish_reason():
@pytest.mark.asyncio
async def test_claude_3_streaming_finish_reason():
try:
litellm.set_verbose = True
messages = [
{"role": "system", "content": "Be helpful"},
{"role": "user", "content": "What do you know?"},
]
response: ModelResponse = completion( # type: ignore
response: ModelResponse = await litellm.acompletion( # type: ignore
model="claude-3-opus-20240229",
messages=messages,
stream=True,
max_tokens=10,
)
complete_response = ""
# Add any assertions here to check the response
# Add any assertions here to-check the response
num_finish_reason = 0
for idx, chunk in enumerate(response):
async for chunk in response:
print(f"chunk: {chunk}")
if isinstance(chunk, ModelResponse):
if chunk.choices[0].finish_reason is not None:
num_finish_reason += 1
@ -2285,7 +2288,7 @@ async def test_acompletion_claude_3_function_call_with_streaming():
elif chunk.choices[0].finish_reason is not None: # last chunk
validate_final_streaming_function_calling_chunk(chunk=chunk)
idx += 1
# raise Exception("it worked!")
# raise Exception("it worked! ")
except Exception as e:
pytest.fail(f"Error occurred: {e}")


@ -32,5 +32,5 @@ class CompletionRequest(BaseModel):
model_list: Optional[List[str]] = None
class Config:
# allow kwargs
extra = "allow"
protected_namespaces = ()
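# clearing protected_namespaces avoids pydantic's protected-namespace warning for fields
# that start with "model_" (e.g. `model_list` above)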


@ -3,7 +3,7 @@ from typing import List, Optional, Union, Dict, Tuple, Literal
from pydantic import BaseModel, validator
from .completion import CompletionRequest
from .embedding import EmbeddingRequest
import uuid
import uuid, enum
class ModelConfig(BaseModel):
@ -12,6 +12,9 @@ class ModelConfig(BaseModel):
tpm: int
rpm: int
class Config:
protected_namespaces = ()
class RouterConfig(BaseModel):
model_list: List[ModelConfig]
@ -41,6 +44,9 @@ class RouterConfig(BaseModel):
"latency-based-routing",
] = "simple-shuffle"
class Config:
protected_namespaces = ()
class ModelInfo(BaseModel):
id: Optional[
@ -127,9 +133,11 @@ class Deployment(BaseModel):
litellm_params: LiteLLM_Params
model_info: ModelInfo
def __init__(self, model_info: Optional[ModelInfo] = None, **params):
def __init__(self, model_info: Optional[Union[ModelInfo, dict]] = None, **params):
if model_info is None:
model_info = ModelInfo()
elif isinstance(model_info, dict):
model_info = ModelInfo(**model_info)
super().__init__(model_info=model_info, **params)
def to_json(self, **kwargs):
@ -141,6 +149,7 @@ class Deployment(BaseModel):
class Config:
extra = "allow"
protected_namespaces = ()
def __contains__(self, key):
# Define custom behavior for the 'in' operator
@ -157,3 +166,11 @@ class Deployment(BaseModel):
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class RouterErrors(enum.Enum):
"""
Enum for router specific errors with common codes
"""
user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
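# matched by substring in the router's retry logic (`RouterErrors.user_defined_ratelimit_error.value in str(e)`),
# so user-defined rate-limit errors are re-raised immediately instead of retried with backoff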


@ -20,6 +20,7 @@ import datetime, time
import tiktoken
import uuid
import aiohttp
import textwrap
import logging
import asyncio, httpx, inspect
from inspect import iscoroutine
@ -236,6 +237,7 @@ class HiddenParams(OpenAIObject):
class Config:
extra = "allow"
protected_namespaces = ()
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -605,7 +607,7 @@ class ModelResponse(OpenAIObject):
class Embedding(OpenAIObject):
embedding: list = []
embedding: Union[list, str] = []
index: int
object: str
@ -1104,7 +1106,6 @@ class Logging:
curl_command = self.model_call_details
# only print verbose if verbose logger is not set
if verbose_logger.level == 0:
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
print_verbose(f"\033[92m{curl_command}\033[0m\n")
@ -1989,9 +1990,6 @@ class Logging:
else:
litellm.cache.add_cache(result, **kwargs)
if isinstance(callback, CustomLogger): # custom logger class
print_verbose(
f"Running Async success callback: {callback}; self.stream: {self.stream}; async_complete_streaming_response: {self.model_call_details.get('async_complete_streaming_response', None)} result={result}"
)
if self.stream == True:
if (
"async_complete_streaming_response"
@ -2375,7 +2373,6 @@ def client(original_function):
if litellm.use_client or (
"use_client" in kwargs and kwargs["use_client"] == True
):
print_verbose(f"litedebugger initialized")
if "lite_debugger" not in litellm.input_callback:
litellm.input_callback.append("lite_debugger")
if "lite_debugger" not in litellm.success_callback:
@ -2999,7 +2996,7 @@ def client(original_function):
)
): # allow users to control returning cached responses from the completion function
# checking cache
print_verbose(f"INSIDE CHECKING CACHE")
print_verbose("INSIDE CHECKING CACHE")
if (
litellm.cache is not None
and str(original_function.__name__)
@ -3106,6 +3103,22 @@ def client(original_function):
response_object=cached_result,
model_response_object=ModelResponse(),
)
if (
call_type == CallTypes.atext_completion.value
and isinstance(cached_result, dict)
):
if kwargs.get("stream", False) == True:
cached_result = convert_to_streaming_response_async(
response_object=cached_result,
)
cached_result = CustomStreamWrapper(
completion_stream=cached_result,
model=model,
custom_llm_provider="cached_response",
logging_obj=logging_obj,
)
else:
cached_result = TextCompletionResponse(**cached_result)
elif call_type == CallTypes.aembedding.value and isinstance(
cached_result, dict
):
@ -3174,7 +3187,13 @@ def client(original_function):
for val in non_null_list:
idx, cr = val # (idx, cr) tuple
if cr is not None:
final_embedding_cached_response.data[idx] = cr
final_embedding_cached_response.data[idx] = (
Embedding(
embedding=cr["embedding"],
index=idx,
object="embedding",
)
)
if len(remaining_list) == 0:
# LOG SUCCESS
cache_hit = True
@ -4837,8 +4856,17 @@ def get_optional_params(
optional_params["top_p"] = top_p
if stream:
optional_params["stream"] = stream
if n is not None:
optional_params["candidate_count"] = n
if stop is not None:
if isinstance(stop, str):
optional_params["stop_sequences"] = [stop]
elif isinstance(stop, list):
optional_params["stop_sequences"] = stop
if max_tokens is not None:
optional_params["max_output_tokens"] = max_tokens
if response_format is not None and response_format["type"] == "json_object":
optional_params["response_mime_type"] = "application/json"
if tools is not None and isinstance(tools, list):
from vertexai.preview import generative_models
@ -5525,6 +5553,9 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
"stream",
"tools",
"tool_choice",
"response_format",
"n",
"stop",
]
elif custom_llm_provider == "sagemaker":
return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
@ -5905,6 +5936,16 @@ def get_api_key(llm_provider: str, dynamic_api_key: Optional[str]):
return api_key
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
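# illustrative usage - tpm/rpm counters are bucketed by UTC minute so every instance,
# regardless of system timezone, agrees on the cache key:
#   current_minute = get_utc_datetime().strftime("%H-%M")
#   rpm_key = f"{model_group}:rpm:{current_minute}"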
def get_max_tokens(model: str):
"""
Get the maximum number of output tokens allowed for a given model.
@ -6523,8 +6564,9 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k
for detail in additional_details:
slack_msg += f"{detail}: {additional_details[detail]}\n"
slack_msg += f"Traceback: {traceback_exception}"
truncated_slack_msg = textwrap.shorten(slack_msg, width=512, placeholder="...")
slack_app.client.chat_postMessage(
channel=alerts_channel, text=slack_msg
channel=alerts_channel, text=truncated_slack_msg
)
elif callback == "sentry":
capture_exception(exception)
@ -7741,7 +7783,7 @@ def exception_type(
)
elif (
"429 Quota exceeded" in error_str
or "IndexError: list index out of range"
or "IndexError: list index out of range" in error_str
):
exception_mapping_worked = True
raise RateLimitError(
@ -8764,7 +8806,9 @@ class CustomStreamWrapper:
return hold, curr_chunk
def handle_anthropic_chunk(self, chunk):
str_line = chunk.decode("utf-8") # Convert bytes to string
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
text = ""
is_finished = False
finish_reason = None
@ -10024,6 +10068,7 @@ class CustomStreamWrapper:
or self.custom_llm_provider == "custom_openai"
or self.custom_llm_provider == "text-completion-openai"
or self.custom_llm_provider == "azure_text"
or self.custom_llm_provider == "anthropic"
or self.custom_llm_provider == "huggingface"
or self.custom_llm_provider == "ollama"
or self.custom_llm_provider == "ollama_chat"


@ -66,6 +66,28 @@
"litellm_provider": "openai",
"mode": "chat"
},
"gpt-4-turbo": {
"max_tokens": 4096,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00001,
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00001,
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
"max_input_tokens": 128000,
@ -948,6 +970,28 @@
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.0-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.0-pro-002": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -970,6 +1014,17 @@
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0409": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-experimental": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -2808,6 +2863,46 @@
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-large-2": {
"max_tokens": 16000,
"max_input_tokens": 16000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-law-2": {
"max_tokens": 16000,
"max_input_tokens": 16000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-code-2": {
"max_tokens": 16000,
"max_input_tokens": 16000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-2": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-lite-02-instruct": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
}
}
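The new pricing entries (gpt-4-turbo, the gemini-1.0/1.5 previews, and the voyage embedding models) feed litellm's cost tracking. A small sketch of computing a per-request estimate straight from the published map, using the same URL the admin UI fetches later in this diff:

import json
import urllib.request

URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
with urllib.request.urlopen(URL) as resp:
    price_map = json.load(resp)

entry = price_map["gpt-4-turbo"]
prompt_tokens, completion_tokens = 1000, 200
cost = (prompt_tokens * entry["input_cost_per_token"]
        + completion_tokens * entry["output_cost_per_token"])
print(f"estimated request cost: ${cost:.4f}")  # 0.01 + 0.006 = $0.0160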

View file

@ -48,7 +48,16 @@ model_list:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: fake-openai-endpoint-2
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
stream_timeout: 0.001
rpm: 1
- model_name: gpt-instruct # [PROD TEST] - tests if `/health` automatically infers this to be a text completion model
litellm_params:
model: text-completion-openai/gpt-3.5-turbo-instruct
litellm_settings:
drop_params: True
# max_budget: 100
@ -58,6 +67,13 @@ litellm_settings:
telemetry: False
context_window_fallbacks: [{"gpt-3.5-turbo": ["gpt-3.5-turbo-large"]}]
router_settings:
routing_strategy: usage-based-routing-v2
redis_host: os.environ/REDIS_HOST
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
enable_pre_call_checks: true
general_settings:
master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys
store_model_in_db: True
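The config changes add a deliberately rate-limited test deployment (rpm: 1) and switch the router to usage-based-routing-v2 backed by Redis. A rough programmatic equivalent, assuming litellm.Router accepts keyword arguments mirroring the YAML keys shown above:

import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint-2",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
                "stream_timeout": 0.001,
                "rpm": 1,
            },
        }
    ],
    routing_strategy="usage-based-routing-v2",   # usage tracked in Redis
    redis_host=os.environ["REDIS_HOST"],
    redis_password=os.environ["REDIS_PASSWORD"],
    redis_port=int(os.environ["REDIS_PORT"]),
    enable_pre_call_checks=True,
)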

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.34.33"
version = "1.35.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.34.33"
version = "1.35.4"
version_files = [
"pyproject.toml:^version"
]

View file

@ -14,9 +14,9 @@ pandas==2.1.1 # for viewing clickhouse spend analytics
prisma==0.11.0 # for db
mangum==0.17.0 # for aws lambda functions
pynacl==1.5.0 # for encrypting keys
google-cloud-aiplatform==1.43.0 # for vertex ai calls
google-cloud-aiplatform==1.47.0 # for vertex ai calls
anthropic[vertex]==0.21.3
google-generativeai==0.3.2 # for vertex ai calls
google-generativeai==0.5.0 # for vertex ai calls
async_generator==1.10.0 # for async ollama calls
langfuse>=2.6.3 # for langfuse self-hosted logging
datadog-api-client==2.23.0 # for datadog logging

View file

@ -53,6 +53,7 @@ model LiteLLM_OrganizationTable {
updated_by String
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
teams LiteLLM_TeamTable[]
users LiteLLM_UserTable[]
}
// Model info for teams, just has model aliases for now.
@ -99,6 +100,7 @@ model LiteLLM_UserTable {
user_id String @id
user_alias String?
team_id String?
organization_id String?
teams String[] @default([])
user_role String?
max_budget Float?
@ -113,6 +115,7 @@ model LiteLLM_UserTable {
allowed_cache_controls String[] @default([])
model_spend Json @default("{}")
model_max_budget Json @default("{}")
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
}
// Generate Tokens for Proxy
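The schema change links users to organizations (organization_id plus the back-relation on LiteLLM_OrganizationTable). A hedged query sketch with prisma-client-py, assuming the generated client follows its usual lowercased-model accessor names:

from prisma import Prisma  # prisma-client-py, generated from the schema above

async def users_in_org(org_id: str):
    # Accessor and field names assume prisma-client-py's lowercased-model
    # convention (litellm_usertable, litellm_organization_table).
    db = Prisma()
    await db.connect()
    users = await db.litellm_usertable.find_many(
        where={"organization_id": org_id},
        include={"litellm_organization_table": True},
    )
    await db.disconnect()
    return users

# import asyncio; asyncio.run(users_in_org("org-123"))  # hypothetical org id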

View file

@ -127,20 +127,6 @@ async def chat_completion(session, key):
raise Exception(f"Request did not return a 200 status code: {status}")
@pytest.mark.asyncio
async def test_add_models():
"""
Add model
Call new model
"""
async with aiohttp.ClientSession() as session:
key_gen = await generate_key(session=session)
key = key_gen["key"]
await add_models(session=session)
await asyncio.sleep(60)
await chat_completion(session=session, key=key)
@pytest.mark.asyncio
async def test_get_models():
"""
@ -178,14 +164,15 @@ async def delete_model(session, model_id="123"):
@pytest.mark.asyncio
async def test_delete_models():
async def test_add_and_delete_models():
"""
Get models user has access to
Add model
Call new model
"""
model_id = "12345"
async with aiohttp.ClientSession() as session:
key_gen = await generate_key(session=session)
key = key_gen["key"]
model_id = "1234"
await add_models(session=session, model_id=model_id)
await asyncio.sleep(60)
await chat_completion(session=session, key=key)
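test_add_and_delete_models now covers the full add-then-delete round trip. The same flow can be driven directly against the proxy; the sketch below assumes the /model/new and /model/delete routes and the sk-1234 master key used elsewhere in this diff, with illustrative litellm_params:

import asyncio
import httpx

BASE, HEADERS = "http://0.0.0.0:4000", {"Authorization": "Bearer sk-1234"}

async def add_then_delete(model_id: str = "12345"):
    async with httpx.AsyncClient(base_url=BASE, headers=HEADERS) as client:
        # Register a new deployment on the proxy.
        await client.post("/model/new", json={
            "model_name": "my-test-model",                       # illustrative
            "litellm_params": {"model": "gpt-3.5-turbo"},        # illustrative
            "model_info": {"id": model_id},
        })
        await asyncio.sleep(60)  # give the proxy time to pick up the new model
        # Remove it again by id, as the test above does.
        await client.post("/model/delete", json={"id": model_id})

# asyncio.run(add_then_delete())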

View file

@ -18,7 +18,12 @@ async def generate_key(session):
url = "http://0.0.0.0:4000/key/generate"
headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}
data = {
"models": ["gpt-4", "text-embedding-ada-002", "dall-e-2"],
"models": [
"gpt-4",
"text-embedding-ada-002",
"dall-e-2",
"fake-openai-endpoint-2",
],
"duration": None,
}
@ -63,14 +68,14 @@ async def new_user(session):
return await response.json()
async def chat_completion(session, key):
async def chat_completion(session, key, model="gpt-4"):
url = "http://0.0.0.0:4000/chat/completions"
headers = {
"Authorization": f"Bearer {key}",
"Content-Type": "application/json",
}
data = {
"model": "gpt-4",
"model": model,
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"},
@ -189,6 +194,31 @@ async def test_chat_completion():
await chat_completion(session=session, key=key_2)
# @pytest.mark.skip(reason="Local test. Proxy not concurrency safe yet. WIP.")
@pytest.mark.asyncio
async def test_chat_completion_ratelimit():
"""
- call model with rpm 1
- make 2 parallel calls
- make sure 1 fails
"""
async with aiohttp.ClientSession() as session:
# key_gen = await generate_key(session=session)
key = "sk-1234"
tasks = []
tasks.append(
chat_completion(session=session, key=key, model="fake-openai-endpoint-2")
)
tasks.append(
chat_completion(session=session, key=key, model="fake-openai-endpoint-2")
)
try:
await asyncio.gather(*tasks)
pytest.fail("Expected at least 1 call to fail")
except Exception as e:
pass
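Since gather() raises on the first failed coroutine, an alternative check is to collect results with return_exceptions=True and assert explicitly that the rpm=1 deployment rejected one of the two concurrent calls; a sketch reusing this file's chat_completion helper:

import asyncio

async def ratelimit_check(session, key):
    # Fire two concurrent calls against the rpm=1 deployment and count failures.
    results = await asyncio.gather(
        chat_completion(session=session, key=key, model="fake-openai-endpoint-2"),
        chat_completion(session=session, key=key, model="fake-openai-endpoint-2"),
        return_exceptions=True,
    )
    failures = [r for r in results if isinstance(r, Exception)]
    assert len(failures) >= 1, "expected the rpm=1 deployment to reject one call"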
@pytest.mark.asyncio
async def test_chat_completion_old_key():
"""

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/a282d1bfd6ed4df8.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-68f14392aea51f63.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-68f14392aea51f63.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/04eb0ce8764f86fe.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46502,[\"253\",\"static/chunks/253-8ab6133ad5f92675.js\",\"931\",\"static/chunks/app/page-a485c9c659128852.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/04eb0ce8764f86fe.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"KnyD0lgLk9_a0erHwSSu-\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-11b043d6a7ef78fa.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-11b043d6a7ef78fa.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/a282d1bfd6ed4df8.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[29306,[\"823\",\"static/chunks/823-2ada48e2e6a5ab39.js\",\"931\",\"static/chunks/app/page-e16bcf8bdc356530.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/a282d1bfd6ed4df8.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"BNBzATtnAelV8BpmzRdfL\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[46502,["253","static/chunks/253-8ab6133ad5f92675.js","931","static/chunks/app/page-a485c9c659128852.js"],""]
3:I[29306,["823","static/chunks/823-2ada48e2e6a5ab39.js","931","static/chunks/app/page-e16bcf8bdc356530.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["KnyD0lgLk9_a0erHwSSu-",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/04eb0ce8764f86fe.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["BNBzATtnAelV8BpmzRdfL",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/a282d1bfd6ed4df8.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -7,6 +7,7 @@ import ModelDashboard from "@/components/model_dashboard";
import ViewUserDashboard from "@/components/view_users";
import Teams from "@/components/teams";
import AdminPanel from "@/components/admins";
import Settings from "@/components/settings";
import ChatUI from "@/components/chat_ui";
import Sidebar from "../components/leftnav";
import Usage from "../components/usage";
@ -160,6 +161,13 @@ const CreateKeyPage = () => {
setTeams={setTeams}
searchParams={searchParams}
accessToken={accessToken}
showSSOBanner={showSSOBanner}
/>
) : page == "settings" ? (
<Settings
userID={userID}
userRole={userRole}
accessToken={accessToken}
/>
) : (
<Usage

View file

@ -27,23 +27,27 @@ import {
Col,
Text,
Grid,
Callout,
} from "@tremor/react";
import { CogIcon } from "@heroicons/react/outline";
import { PencilAltIcon } from "@heroicons/react/outline";
interface AdminPanelProps {
searchParams: any;
accessToken: string | null;
setTeams: React.Dispatch<React.SetStateAction<Object[] | null>>;
showSSOBanner: boolean;
}
import {
userUpdateUserCall,
Member,
userGetAllUsersCall,
User,
setCallbacksCall,
} from "./networking";
const AdminPanel: React.FC<AdminPanelProps> = ({
searchParams,
accessToken,
showSSOBanner
}) => {
const [form] = Form.useForm();
const [memberForm] = Form.useForm();
@ -52,6 +56,47 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
const [admins, setAdmins] = useState<null | any[]>(null);
const [isAddMemberModalVisible, setIsAddMemberModalVisible] = useState(false);
const [isAddAdminModalVisible, setIsAddAdminModalVisible] = useState(false);
const [isUpdateMemberModalVisible, setIsUpdateModalModalVisible] = useState(false);
const [isAddSSOModalVisible, setIsAddSSOModalVisible] = useState(false);
const [isInstructionsModalVisible, setIsInstructionsModalVisible] = useState(false);
let nonSssoUrl;
try {
nonSssoUrl = window.location.origin;
} catch (error) {
nonSssoUrl = '<your-proxy-url>';
}
nonSssoUrl += '/fallback/login';
const handleAddSSOOk = () => {
setIsAddSSOModalVisible(false);
form.resetFields();
};
const handleAddSSOCancel = () => {
setIsAddSSOModalVisible(false);
form.resetFields();
};
const handleShowInstructions = (formValues: Record<string, any>) => {
handleAdminCreate(formValues);
handleSSOUpdate(formValues);
setIsAddSSOModalVisible(false);
setIsInstructionsModalVisible(true);
// Optionally, you can call handleSSOUpdate here with the formValues
};
const handleInstructionsOk = () => {
setIsInstructionsModalVisible(false);
};
const handleInstructionsCancel = () => {
setIsInstructionsModalVisible(false);
};
const roles = ["proxy_admin", "proxy_admin_viewer"]
useEffect(() => {
// Fetch model info and set the default selected model
@ -94,26 +139,138 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
fetchProxyAdminInfo();
}, [accessToken]);
const handleMemberUpdateOk = () => {
setIsUpdateModalModalVisible(false);
memberForm.resetFields();
};
const handleMemberOk = () => {
setIsAddMemberModalVisible(false);
memberForm.resetFields();
};
const handleAdminOk = () => {
setIsAddAdminModalVisible(false);
memberForm.resetFields();
};
const handleMemberCancel = () => {
setIsAddMemberModalVisible(false);
memberForm.resetFields();
};
const handleAdminCancel = () => {
setIsAddAdminModalVisible(false);
memberForm.resetFields();
};
const handleMemberUpdateCancel = () => {
setIsUpdateModalModalVisible(false);
memberForm.resetFields();
}
// Define the type for the handleMemberCreate function
type HandleMemberCreate = (formValues: Record<string, any>) => Promise<void>;
const addMemberForm = (handleMemberCreate: HandleMemberCreate,) => {
return <Form
form={form}
onFinish={handleMemberCreate}
labelCol={{ span: 8 }}
wrapperCol={{ span: 16 }}
labelAlign="left"
>
<>
<Form.Item label="Email" name="user_email" className="mb-4">
<Input
name="user_email"
className="px-3 py-2 border rounded-md w-full"
/>
</Form.Item>
<div className="text-center mb-4">OR</div>
<Form.Item label="User ID" name="user_id" className="mb-4">
<Input
name="user_id"
className="px-3 py-2 border rounded-md w-full"
/>
</Form.Item>
</>
<div style={{ textAlign: "right", marginTop: "10px" }}>
<Button2 htmlType="submit">Add member</Button2>
</div>
</Form>
}
const modifyMemberForm = (handleMemberUpdate: HandleMemberCreate, currentRole: string, userID: string) => {
return <Form
form={form}
onFinish={handleMemberUpdate}
labelCol={{ span: 8 }}
wrapperCol={{ span: 16 }}
labelAlign="left"
>
<>
<Form.Item rules={[{ required: true, message: 'Required' }]} label="User Role" name="user_role" labelCol={{ span: 10 }} labelAlign="left">
<Select value={currentRole}>
{roles.map((role, index) => (
<SelectItem
key={index}
value={role}
>
{role}
</SelectItem>
))}
</Select>
</Form.Item>
<Form.Item
label="User ID"
name="user_id"
hidden={true}
initialValue={userID}
valuePropName="user_id"
className="mt-8"
>
<Input value={userID} disabled />
</Form.Item>
</>
<div style={{ textAlign: "right", marginTop: "10px" }}>
<Button2 htmlType="submit">Update role</Button2>
</div>
</Form>
}
const handleMemberUpdate = async (formValues: Record<string, any>) => {
try{
if (accessToken != null && admins != null) {
message.info("Making API Call");
const response: any = await userUpdateUserCall(accessToken, formValues, null);
console.log(`response for team create call: ${response}`);
// Checking if the team exists in the list and updating or adding accordingly
const foundIndex = admins.findIndex((user) => {
console.log(
`user.user_id=${user.user_id}; response.user_id=${response.user_id}`
);
return user.user_id === response.user_id;
});
console.log(`foundIndex: ${foundIndex}`);
if (foundIndex == -1) {
console.log(`updates admin with new user`);
admins.push(response);
// User not in the list yet, so add them
setAdmins(admins); // Set the new state
}
message.success("Refresh tab to see updated user role")
setIsUpdateModalModalVisible(false);
}
} catch (error) {
console.error("Error creating the key:", error);
}
}
const handleMemberCreate = async (formValues: Record<string, any>) => {
try {
if (accessToken != null && admins != null) {
message.info("Making API Call");
const user_role: Member = {
role: "user",
user_email: formValues.user_email,
user_id: formValues.user_id,
};
const response: any = await userUpdateUserCall(accessToken, formValues);
const response: any = await userUpdateUserCall(accessToken, formValues, "proxy_admin_viewer");
console.log(`response for team create call: ${response}`);
// Checking if the team exists in the list and updating or adding accordingly
const foundIndex = admins.findIndex((user) => {
@ -135,18 +292,66 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
console.error("Error creating the key:", error);
}
};
const handleAdminCreate = async (formValues: Record<string, any>) => {
try {
if (accessToken != null && admins != null) {
message.info("Making API Call");
const user_role: Member = {
role: "user",
user_email: formValues.user_email,
user_id: formValues.user_id,
};
const response: any = await userUpdateUserCall(accessToken, formValues, "proxy_admin");
console.log(`response for team create call: ${response}`);
// Checking if the team exists in the list and updating or adding accordingly
const foundIndex = admins.findIndex((user) => {
console.log(
`user.user_id=${user.user_id}; response.user_id=${response.user_id}`
);
return user.user_id === response.user_id;
});
console.log(`foundIndex: ${foundIndex}`);
if (foundIndex == -1) {
console.log(`updates admin with new user`);
admins.push(response);
// User not in the list yet, so add them
setAdmins(admins); // Set the new state
}
setIsAddAdminModalVisible(false);
}
} catch (error) {
console.error("Error creating the key:", error);
}
};
const handleSSOUpdate = async (formValues: Record<string, any>) => {
if (accessToken == null) {
return;
}
let payload = {
environment_variables: {
PROXY_BASE_URL: formValues.proxy_base_url,
GOOGLE_CLIENT_ID: formValues.google_client_id,
GOOGLE_CLIENT_SECRET: formValues.google_client_secret,
},
};
setCallbacksCall(accessToken, payload);
}
console.log(`admins: ${admins?.length}`);
return (
<div className="w-full m-2 mt-2 p-8">
<Title level={4}>Restricted Access</Title>
<Title level={4}>Admin Access </Title>
<Paragraph>
Add other people to just view spend. They cannot create keys, teams or
{
showSSOBanner && <a href="https://docs.litellm.ai/docs/proxy/ui#restrict-ui-access">Requires SSO Setup</a>
}
<br/>
<b>Proxy Admin: </b> Can create keys, teams, users, add models, etc. <br/>
<b>Proxy Admin Viewer: </b>Can just view spend. They cannot create keys, teams or
grant users access to new models.{" "}
<a href="https://docs.litellm.ai/docs/proxy/ui#restrict-ui-access">
Requires SSO Setup
</a>
</Paragraph>
<Grid numItems={1} className="gap-2 p-2 w-full">
<Col numColSpan={1}>
<Card className="w-full mx-auto flex-auto overflow-y-auto max-h-[50vh]">
<Table>
@ -154,7 +359,6 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
<TableRow>
<TableHeaderCell>Member Name</TableHeaderCell>
<TableHeaderCell>Role</TableHeaderCell>
{/* <TableHeaderCell>Action</TableHeaderCell> */}
</TableRow>
</TableHead>
@ -170,9 +374,18 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
: null}
</TableCell>
<TableCell>{member["user_role"]}</TableCell>
{/* <TableCell>
<Icon icon={CogIcon} size="sm" />
</TableCell> */}
<TableCell>
<Icon icon={PencilAltIcon} size="sm" onClick={() => setIsUpdateModalModalVisible(true)}/>
<Modal
title="Update role"
visible={isUpdateMemberModalVisible}
width={800}
footer={null}
onOk={handleMemberUpdateOk}
onCancel={handleMemberUpdateCancel}>
{modifyMemberForm(handleMemberUpdate, member["user_role"], member["user_id"])}
</Modal>
</TableCell>
</TableRow>
))
: null}
@ -181,11 +394,27 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
</Card>
</Col>
<Col numColSpan={1}>
<div className="flex justify-start">
<Button
className="mx-auto mb-5"
onClick={() => setIsAddMemberModalVisible(true)}
className="mr-4 mb-5"
onClick={() => setIsAddAdminModalVisible(true)}
>
+ Add viewer
+ Add admin
</Button>
<Modal
title="Add admin"
visible={isAddAdminModalVisible}
width={800}
footer={null}
onOk={handleAdminOk}
onCancel={handleAdminCancel}>
{addMemberForm(handleAdminCreate)}
</Modal>
<Button
className="mb-5"
onClick={() => setIsAddMemberModalVisible(true)}
>
+ Add viewer
</Button>
<Modal
title="Add viewer"
@ -195,35 +424,99 @@ const AdminPanel: React.FC<AdminPanelProps> = ({
onOk={handleMemberOk}
onCancel={handleMemberCancel}
>
<Form
form={form}
onFinish={handleMemberCreate}
labelCol={{ span: 8 }}
wrapperCol={{ span: 16 }}
labelAlign="left"
>
<>
<Form.Item label="Email" name="user_email" className="mb-4">
<Input
name="user_email"
className="px-3 py-2 border rounded-md w-full"
/>
</Form.Item>
<div className="text-center mb-4">OR</div>
<Form.Item label="User ID" name="user_id" className="mb-4">
<Input
name="user_id"
className="px-3 py-2 border rounded-md w-full"
/>
</Form.Item>
</>
<div style={{ textAlign: "right", marginTop: "10px" }}>
<Button2 htmlType="submit">Add member</Button2>
</div>
</Form>
{addMemberForm(handleMemberCreate)}
</Modal>
</div>
</Col>
</Grid>
<Grid>
<Title level={4}>Add SSO</Title>
<div className="flex justify-start mb-4">
<Button onClick={() => setIsAddSSOModalVisible(true)}>Add SSO</Button>
<Modal
title="Add SSO"
visible={isAddSSOModalVisible}
width={800}
footer={null}
onOk={handleAddSSOOk}
onCancel={handleAddSSOCancel}
>
<Form
form={form}
onFinish={handleShowInstructions}
labelCol={{ span: 8 }}
wrapperCol={{ span: 16 }}
labelAlign="left"
>
<>
<Form.Item
label="Admin Email"
name="user_email"
rules={[{ required: true, message: "Please enter the email of the proxy admin" }]}
>
<Input />
</Form.Item>
<Form.Item
label="PROXY BASE URL"
name="proxy_base_url"
rules={[{ required: true, message: "Please enter the proxy base url" }]}
>
<Input />
</Form.Item>
<Form.Item
label="GOOGLE CLIENT ID"
name="google_client_id"
rules={[{ required: true, message: "Please enter the google client id" }]}
>
<Input.Password />
</Form.Item>
<Form.Item
label="GOOGLE CLIENT SECRET"
name="google_client_secret"
rules={[{ required: true, message: "Please enter the google client secret" }]}
>
<Input.Password />
</Form.Item>
</>
<div style={{ textAlign: "right", marginTop: "10px" }}>
<Button2 htmlType="submit">Save</Button2>
</div>
</Form>
</Modal>
<Modal
title="SSO Setup Instructions"
visible={isInstructionsModalVisible}
width={800}
footer={null}
onOk={handleInstructionsOk}
onCancel={handleInstructionsCancel}
>
<p>Follow these steps to complete the SSO setup:</p>
<Text className="mt-2">
1. Do NOT exit this tab
</Text>
<Text className="mt-2">
2. Open a new tab and visit your proxy base URL
</Text>
<Text className="mt-2">
3. Confirm SSO is configured correctly and you can log in on the new tab
</Text>
<Text className="mt-2">
4. If Step 3 is successful, you can close this tab
</Text>
<div style={{ textAlign: "right", marginTop: "10px" }}>
<Button2 onClick={handleInstructionsOk}>Done</Button2>
</div>
</Modal>
</div>
<Callout title="Login without SSO" color="teal">
If you need to log in without SSO, you can access <a href={nonSssoUrl} target="_blank"><b>{nonSssoUrl}</b></a>
</Callout>
</Grid>
</div>
);
};
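handleSSOUpdate above persists the Google SSO settings as proxy environment variables via setCallbacksCall. Expressed as a direct HTTP call (hedged: this assumes setCallbacksCall wraps the proxy's /config/update route), the same update looks roughly like:

import httpx

payload = {
    "environment_variables": {
        "PROXY_BASE_URL": "https://proxy.example.com",     # illustrative values
        "GOOGLE_CLIENT_ID": "<google-client-id>",
        "GOOGLE_CLIENT_SECRET": "<google-client-secret>",
    }
}
resp = httpx.post(
    "http://0.0.0.0:4000/config/update",
    headers={"Authorization": "Bearer sk-1234"},
    json=payload,
)
resp.raise_for_status()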

View file

@ -46,8 +46,8 @@ const Sidebar: React.FC<SidebarProps> = ({
);
}
return (
<Layout style={{ minHeight: "100vh", maxWidth: "100px" }}>
<Sider width={100}>
<Layout style={{ minHeight: "100vh", maxWidth: "120px" }}>
<Sider width={120}>
<Menu
mode="inline"
defaultSelectedKeys={defaultSelectedKey ? defaultSelectedKey : ["1"]}
@ -63,6 +63,11 @@ const Sidebar: React.FC<SidebarProps> = ({
Test Key
</Text>
</Menu.Item>
<Menu.Item key="2" onClick={() => setPage("models")}>
<Text>
Models
</Text>
</Menu.Item>
{userRole == "Admin" ? (
<Menu.Item key="6" onClick={() => setPage("teams")}>
<Text>
@ -82,10 +87,10 @@ const Sidebar: React.FC<SidebarProps> = ({
</Text>
</Menu.Item>
) : null}
<Menu.Item key="2" onClick={() => setPage("models")}>
<Menu.Item key="8" onClick={() => setPage("settings")}>
<Text>
Models
</Text>
Integrations
</Text>
</Menu.Item>
{userRole == "Admin" ? (
<Menu.Item key="7" onClick={() => setPage("admin-panel")}>

View file

@ -13,9 +13,9 @@ import {
Text,
Grid,
} from "@tremor/react";
import { TabPanel, TabPanels, TabGroup, TabList, Tab, TextInput } from "@tremor/react";
import { Select, SelectItem } from "@tremor/react";
import { modelInfoCall, userGetRequesedtModelsCall, modelMetricsCall, modelCreateCall, Model } from "./networking";
import { TabPanel, TabPanels, TabGroup, TabList, Tab, TextInput, Icon } from "@tremor/react";
import { Select, SelectItem, MultiSelect, MultiSelectItem } from "@tremor/react";
import { modelInfoCall, userGetRequesedtModelsCall, modelMetricsCall, modelCreateCall, Model, modelCostMap, modelDeleteCall } from "./networking";
import { BarChart } from "@tremor/react";
import {
Button as Button2,
@ -33,7 +33,8 @@ import {
import { Badge, BadgeDelta, Button } from "@tremor/react";
import RequestAccess from "./request_model_access";
import { Typography } from "antd";
import TextArea from "antd/es/input/TextArea";
import { InformationCircleIcon, PencilAltIcon, PencilIcon, StatusOnlineIcon, TrashIcon } from "@heroicons/react/outline";
const { Title: Title2, Link } = Typography;
interface ModelDashboardProps {
@ -43,6 +44,26 @@ interface ModelDashboardProps {
userID: string | null;
}
//["OpenAI", "Azure OpenAI", "Anthropic", "Gemini (Google AI Studio)", "Amazon Bedrock", "OpenAI-Compatible Endpoints (Groq, Together AI, Mistral AI, etc.)"]
enum Providers {
OpenAI = "OpenAI",
Azure = "Azure",
Anthropic = "Anthropic",
Google_AI_Studio = "Gemini (Google AI Studio)",
Bedrock = "Amazon Bedrock",
OpenAI_Compatible = "OpenAI-Compatible Endpoints (Groq, Together AI, Mistral AI, etc.)"
}
const provider_map: Record <string, string> = {
"OpenAI": "openai",
"Azure": "azure",
"Anthropic": "anthropic",
"Google_AI_Studio": "gemini",
"Bedrock": "bedrock",
"OpenAI_Compatible": "openai"
};
const ModelDashboard: React.FC<ModelDashboardProps> = ({
accessToken,
token,
@ -53,8 +74,12 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
const [modelMetrics, setModelMetrics] = useState<any[]>([]);
const [pendingRequests, setPendingRequests] = useState<any[]>([]);
const [form] = Form.useForm();
const [modelMap, setModelMap] = useState<any>(null);
const providers = ["OpenAI", "Azure OpenAI", "Anthropic", "Gemini (Google AI Studio)", "Amazon Bedrock", "OpenAI-Compatible Endpoints (Groq, Together AI, Mistral AI, etc.)"]
const [providerModels, setProviderModels] = useState<Array<string>>([]); // Explicitly typing providerModels as a string array
const providers: Providers[] = [Providers.OpenAI, Providers.Azure, Providers.Anthropic, Providers.Google_AI_Studio, Providers.Bedrock, Providers.OpenAI_Compatible]
const [selectedProvider, setSelectedProvider] = useState<String>("OpenAI");
useEffect(() => {
@ -95,7 +120,16 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
if (accessToken && token && userRole && userID) {
fetchData();
}
}, [accessToken, token, userRole, userID]);
const fetchModelMap = async () => {
const data = await modelCostMap()
console.log(`received model cost map data: ${Object.keys(data)}`)
setModelMap(data)
}
if (modelMap == null) {
fetchModelMap()
}
}, [accessToken, token, userRole, userID, modelMap]);
if (!modelData) {
return <div>Loading...</div>;
@ -109,7 +143,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
// loop through model data and edit each row
for (let i = 0; i < modelData.data.length; i++) {
let curr_model = modelData.data[i];
let litellm_model_name = curr_model?.litellm_params?.mode
let litellm_model_name = curr_model?.litellm_params?.model
let model_info = curr_model?.model_info;
let defaultProvider = "openai";
@ -117,6 +151,22 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
let input_cost = "Undefined";
let output_cost = "Undefined";
let max_tokens = "Undefined";
let cleanedLitellmParams = {};
const getProviderFromModel = (model: string) => {
/**
* Use model map
* - check if model in model map
* - return its litellm_provider, if so
*/
console.log(`GET PROVIDER CALLED! - ${modelMap}`)
if (modelMap !== null && modelMap !== undefined) {
if (typeof modelMap == "object" && model in modelMap) {
return modelMap[model]["litellm_provider"]
}
}
return "openai"
}
// Check if litellm_model_name is null or undefined
if (litellm_model_name) {
@ -127,10 +177,10 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
let firstElement = splitModel[0];
// If there is only one element, default provider to openai
provider = splitModel.length === 1 ? defaultProvider : firstElement;
provider = splitModel.length === 1 ? getProviderFromModel(litellm_model_name) : firstElement;
} else {
// litellm_model_name is null or undefined, default provider to openai
provider = defaultProvider;
provider = "openai"
}
if (model_info) {
@ -138,11 +188,22 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
output_cost = model_info?.output_cost_per_token;
max_tokens = model_info?.max_tokens;
}
// cleanedLitellmParams = litellm_params without model and api_base
if (curr_model?.litellm_params) {
cleanedLitellmParams = Object.fromEntries(
Object.entries(curr_model?.litellm_params).filter(
([key]) => key !== "model" && key !== "api_base"
)
);
}
modelData.data[i].provider = provider;
modelData.data[i].input_cost = input_cost;
modelData.data[i].output_cost = output_cost;
modelData.data[i].max_tokens = max_tokens;
modelData.data[i].api_base = curr_model?.litellm_params?.api_base;
modelData.data[i].cleanedLitellmParams = cleanedLitellmParams;
all_models_on_proxy.push(curr_model.model_name);
@ -162,43 +223,115 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
);
}
const handleDelete = async (model_id: string) => {
await modelDeleteCall(accessToken, model_id)
};
const setProviderModelsFn = (provider: string) => {
console.log(`received provider string: ${provider}`)
const providerEnumValue = Providers[provider as keyof typeof Providers];
console.log(`received providerEnumValue: ${providerEnumValue}`)
const mappingResult = provider_map[providerEnumValue]; // Get the corresponding value from the mapping
console.log(`mappingResult: ${mappingResult}`)
let _providerModels: Array<string> = []
if (typeof modelMap === 'object') {
Object.entries(modelMap).forEach(([key, value]) => {
if (value !== null && typeof value === 'object' && "litellm_provider" in value && value["litellm_provider"] === mappingResult) {
_providerModels.push(key);
}
});
}
setProviderModels(_providerModels)
console.log(`providerModels: ${providerModels}`);
}
const handleSubmit = async (formValues: Record<string, any>) => {
const litellmParamsObj: Record<string, any> = {};
const modelInfoObj: Record<string, any> = {};
let modelName: string = "";
// Iterate through the key-value pairs in formValues
for (const [key, value] of Object.entries(formValues)) {
if (key == "model_name") {
modelName = value
try {
/**
* For multiple litellm model names - create a separate deployment for each
* - get the list
* - iterate through it
* - create a new deployment for each
*/
// get the list of deployments
let deployments: Array<string> = Object.values(formValues["model"])
console.log(`received deployments: ${deployments}`)
console.log(`received type of deployments: ${typeof deployments}`)
deployments.forEach(async (litellm_model) => {
console.log(`litellm_model: ${litellm_model}`)
const litellmParamsObj: Record<string, any> = {};
const modelInfoObj: Record<string, any> = {};
// Iterate through the key-value pairs in formValues
litellmParamsObj["model"] = litellm_model
let modelName: string = "";
for (const [key, value] of Object.entries(formValues)) {
if (key == "model_name") {
modelName = modelName + value
}
else if (key == "custom_llm_provider") {
// const providerEnumValue = Providers[value as keyof typeof Providers];
// const mappingResult = provider_map[providerEnumValue]; // Get the corresponding value from the mapping
// modelName = mappingResult + "/" + modelName
continue
}
else if (key == "model") {
continue
}
// Check if key is "base_model"
else if (key === "base_model") {
// Add key-value pair to model_info dictionary
modelInfoObj[key] = value;
}
else if (key == "litellm_extra_params") {
console.log("litellm_extra_params:", value);
let litellmExtraParams = {};
if (value && value != undefined) {
try {
litellmExtraParams = JSON.parse(value);
}
catch (error) {
message.error("Failed to parse LiteLLM Extra Params: " + error);
throw new Error("Failed to parse litellm_extra_params: " + error);
}
for (const [key, value] of Object.entries(litellmExtraParams)) {
litellmParamsObj[key] = value;
}
}
}
// Check if key is any of the specified API related keys
else {
// Add key-value pair to litellm_params dictionary
litellmParamsObj[key] = value;
}
}
const new_model: Model = {
"model_name": modelName,
"litellm_params": litellmParamsObj,
"model_info": modelInfoObj
}
const response: any = await modelCreateCall(
accessToken,
new_model
);
console.log(`response for model create call: ${response["data"]}`);
});
form.resetFields();
} catch (error) {
message.error("Failed to create model: " + error);
}
// Check if key is any of the specified API related keys
if (key === "api_key" || key === "model" || key === "api_base" || key === "api_version" || key.startsWith("aws_")) {
// Add key-value pair to litellm_params dictionary
litellmParamsObj[key] = value;
}
// Check if key is "base_model"
if (key === "base_model") {
// Add key-value pair to model_info dictionary
modelInfoObj[key] = value;
}
}
const new_model: Model = {
"model_name": modelName,
"litellm_params": litellmParamsObj,
"model_info": modelInfoObj
}
const response: any = await modelCreateCall(
accessToken,
new_model
);
console.log(`response for model create call: ${response["data"]}`);
}
const handleOk = () => {
@ -206,7 +339,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
.validateFields()
.then((values) => {
handleSubmit(values);
form.resetFields();
// form.resetFields();
})
.catch((error) => {
console.error("Validation failed:", error);
@ -214,7 +347,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
};
console.log(`selectedProvider: ${selectedProvider}`)
console.log(`providerModels.length: ${providerModels.length}`)
return (
<div style={{ width: "100%", height: "100%"}}>
<TabGroup className="gap-2 p-8 h-[75vh] w-full mt-2">
@ -244,7 +377,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
)
}
<TableHeaderCell>
Access
Extra litellm Params
</TableHeaderCell>
<TableHeaderCell>Input Price per token ($)</TableHeaderCell>
<TableHeaderCell>Output Price per token ($)</TableHeaderCell>
@ -252,8 +385,8 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</TableRow>
</TableHead>
<TableBody>
{modelData.data.map((model: any) => (
<TableRow key={model.model_name}>
{modelData.data.map((model: any, index: number) => (
<TableRow key={index}>
<TableCell>
<Text>{model.model_name}</Text>
</TableCell>
@ -265,20 +398,15 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
}
<TableCell>
{model.user_access ? (
<Badge color={"green"}>Yes</Badge>
) : (
<RequestAccess
userModels={all_models_on_proxy}
accessToken={accessToken}
userID={userID}
></RequestAccess>
)}
<pre>
{JSON.stringify(model.cleanedLitellmParams, null, 2)}
</pre>
</TableCell>
<TableCell>{model.input_cost}</TableCell>
<TableCell>{model.output_cost}</TableCell>
<TableCell>{model.max_tokens}</TableCell>
<TableCell><Icon icon={TrashIcon} size="sm" onClick={() => handleDelete(model.model_info.id)}/></TableCell>
</TableRow>
))}
</TableBody>
@ -331,6 +459,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
key={index}
value={provider}
onClick={() => {
setProviderModelsFn(provider);
setSelectedProvider(provider);
}}
>
@ -344,18 +473,28 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
<Row>
<Col span={10}></Col>
<Col span={10}><Text className="mb-3 mt-1">Model name your users will pass in. Also used for <Link href="https://docs.litellm.ai/docs/proxy/reliability#step-1---set-deployments-on-config" target="_blank">loadbalancing.</Link></Text></Col>
<Col span={10}><Text className="mb-3 mt-1">Model name your users will pass in.</Text></Col>
</Row>
<Form.Item rules={[{ required: true, message: 'Required' }]} label="LiteLLM Model Name" name="model" tooltip="Actual model name used for making litellm.completion() call." className="mb-0">
<TextInput placeholder="gpt-3.5-turbo-0125"/>
<Form.Item rules={[{ required: true, message: 'Required' }]} label="LiteLLM Model Name(s)" name="model" tooltip="Actual model name used for making litellm.completion() call." className="mb-0">
{selectedProvider === Providers.Azure ? (
<TextInput placeholder="Enter model name" />
) : providerModels.length > 0 ? (
<MultiSelect value={providerModels}>
{providerModels.map((model, index) => (
<MultiSelectItem key={index} value={model}>
{model}
</MultiSelectItem>
))}
</MultiSelect>
) : (
<TextInput placeholder="gpt-3.5-turbo-0125" />
)}
</Form.Item>
<Row>
<Col span={10}></Col>
<Col span={10}><Text className="mb-3 mt-1">Actual model name used for making <Link href="https://docs.litellm.ai/docs/providers" target="_blank">litellm.completion() call</Link></Text></Col>
</Row>
<Col span={10}><Text className="mb-3 mt-1">Actual model name used for making the <Link href="https://docs.litellm.ai/docs/providers" target="_blank">litellm.completion() call</Link>. We&apos;ll <Link href="https://docs.litellm.ai/docs/proxy/reliability#step-1---set-deployments-on-config" target="_blank">loadbalance</Link> models with the same &apos;public name&apos;.</Text></Col></Row>
{
selectedProvider != "Amazon Bedrock" && <Form.Item
selectedProvider != Providers.Bedrock && <Form.Item
rules={[{ required: true, message: 'Required' }]}
label="API Key"
name="api_key"
@ -364,7 +503,15 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
}
{
selectedProvider == "Azure OpenAI" && <Form.Item
selectedProvider == Providers.OpenAI && <Form.Item
label="Organization ID"
name="organization_id"
>
<TextInput placeholder="[OPTIONAL] my-unique-org"/>
</Form.Item>
}
{
(selectedProvider == Providers.Azure || selectedProvider == Providers.OpenAI_Compatible) && <Form.Item
rules={[{ required: true, message: 'Required' }]}
label="API Base"
name="api_base"
@ -373,7 +520,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
}
{
selectedProvider == "Azure OpenAI" && <Form.Item
selectedProvider == Providers.Azure && <Form.Item
rules={[{ required: true, message: 'Required' }]}
label="API Version"
name="api_version"
@ -382,7 +529,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
}
{
selectedProvider == "Azure OpenAI" && <Form.Item
selectedProvider == Providers.Azure && <Form.Item
label="Base Model"
name="base_model"
>
@ -391,7 +538,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
}
{
selectedProvider == "Amazon Bedrock" && <Form.Item
selectedProvider == Providers.Bedrock && <Form.Item
rules={[{ required: true, message: 'Required' }]}
label="AWS Access Key ID"
name="aws_access_key_id"
@ -401,7 +548,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
}
{
selectedProvider == "Amazon Bedrock" && <Form.Item
selectedProvider == Providers.Bedrock && <Form.Item
rules={[{ required: true, message: 'Required' }]}
label="AWS Secret Access Key"
name="aws_secret_access_key"
@ -411,7 +558,7 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
</Form.Item>
}
{
selectedProvider == "Amazon Bedrock" && <Form.Item
selectedProvider == Providers.Bedrock && <Form.Item
rules={[{ required: true, message: 'Required' }]}
label="AWS Region Name"
name="aws_region_name"
@ -420,6 +567,22 @@ const ModelDashboard: React.FC<ModelDashboardProps> = ({
<TextInput placeholder="us-east-1"/>
</Form.Item>
}
<Form.Item label="LiteLLM Params" name="litellm_extra_params" tooltip="Optional litellm params used for making a litellm.completion() call." className="mb-0">
<TextArea
rows={4}
placeholder='{
"rpm": 100,
"timeout": 0,
"stream_timeout": 0
}'
/>
</Form.Item>
<Row>
<Col span={10}></Col>
<Col span={10}><Text className="mb-3 mt-1">Pass a JSON object of litellm-supported params for the <Link href="https://docs.litellm.ai/docs/completion/input" target="_blank">litellm.completion() call</Link></Text></Col>
</Row>
</>
<div style={{ textAlign: "center", marginTop: "10px" }}>
<Button2 htmlType="submit">Add Model</Button2>
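Both getProviderFromModel and setProviderModelsFn above key off the litellm_provider field of the public model map. The same lookup sketched in Python, kept deliberately close to the dashboard's defaults:

import json
import urllib.request

URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
with urllib.request.urlopen(URL) as resp:
    model_map = json.load(resp)

def provider_for(model: str) -> str:
    # Same fallback as the UI: default to "openai" when the model is unknown.
    entry = model_map.get(model)
    return entry.get("litellm_provider", "openai") if isinstance(entry, dict) else "openai"

def models_for_provider(provider: str) -> list:
    return [m for m, v in model_map.items()
            if isinstance(v, dict) and v.get("litellm_provider") == provider]

print(provider_for("gemini-1.5-pro"), len(models_for_provider("anthropic")))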

View file

@ -32,6 +32,7 @@ const Navbar: React.FC<NavbarProps> = ({
}) => {
console.log("User ID:", userID);
console.log("userEmail:", userEmail);
console.log("showSSOBanner:", showSSOBanner);
// const userColors = require('./ui_colors.json') || {};
const isLocal = process.env.NODE_ENV === "development";
@ -67,13 +68,25 @@ const Navbar: React.FC<NavbarProps> = ({
</div>
</div>
<div className="text-right mx-4 my-2 absolute top-0 right-0 flex items-center justify-end space-x-2">
{showSSOBanner ? (
{showSSOBanner ? (
<div style={{
// border: '1px solid #391085',
padding: '6px',
borderRadius: '8px', // Added border-radius property
}}
>
<a
href="https://docs.litellm.ai/docs/proxy/ui#setup-ssoauth-for-ui"
href="https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
target="_blank"
className="mr-2"
style={{
"fontSize": "14px",
"textDecoration": "underline"
}}
>
Request hosted proxy
</a>
</div>
) : null}
<div style={{

View file

@ -12,6 +12,18 @@ export interface Model {
model_info: Object | null;
}
export const modelCostMap = async () => {
try {
const response = await fetch('https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json');
const jsonData = await response.json();
console.log("received data:", jsonData);
return jsonData
} catch (error) {
console.error("Failed to get model cost map:", error);
throw error;
}
}
export const modelCreateCall = async (
accessToken: string,
formValues: Model
@ -38,7 +50,42 @@ export const modelCreateCall = async (
const data = await response.json();
console.log("API Response:", data);
message.success("Model created successfully. Wait 60s and refresh.")
message.success("Model created successfully. Wait 60s and refresh on 'All Models' page");
return data;
} catch (error) {
console.error("Failed to create key:", error);
throw error;
}
}
export const modelDeleteCall = async (
accessToken: string,
model_id: string,
) => {
console.log(`model_id in model delete call: ${model_id}`)
try {
const url = proxyBaseUrl ? `${proxyBaseUrl}/model/delete` : `/model/delete`;
const response = await fetch(url, {
method: "POST",
headers: {
Authorization: `Bearer ${accessToken}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
"id": model_id,
}),
});
if (!response.ok) {
const errorData = await response.text();
message.error("Failed to create key: " + errorData);
console.error("Error response from the server:", errorData);
throw new Error("Network response was not ok");
}
const data = await response.json();
console.log("API Response:", data);
message.success("Model deleted successfully. Restart server to see this.");
return data;
} catch (error) {
console.error("Failed to create key:", error);
@ -339,6 +386,7 @@ export const modelInfoCall = async (
}
const data = await response.json();
console.log("modelInfoCall:", data);
//message.info("Received model data");
return data;
// Handle success - you might want to update some state or UI based on the created key
@ -1008,22 +1056,25 @@ export const teamMemberAddCall = async (
export const userUpdateUserCall = async (
accessToken: string,
formValues: any // Assuming formValues is an object
formValues: any, // Assuming formValues is an object
userRole: string | null
) => {
try {
console.log("Form Values in userUpdateUserCall:", formValues); // Log the form values before making the API call
const url = proxyBaseUrl ? `${proxyBaseUrl}/user/update` : `/user/update`;
let response_body = {...formValues};
if (userRole !== null) {
response_body["user_role"] = userRole;
}
response_body = JSON.stringify(response_body);
const response = await fetch(url, {
method: "POST",
headers: {
Authorization: `Bearer ${accessToken}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
user_role: "proxy_admin_viewer",
...formValues, // Include formValues in the request body
}),
body: response_body,
});
if (!response.ok) {
@ -1119,3 +1170,85 @@ export const slackBudgetAlertsHealthCheck = async (accessToken: String) => {
}
};
export const getCallbacksCall = async (
accessToken: String,
userID: String,
userRole: String
) => {
/**
* Get the callbacks configured on the proxy
*/
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/get/config/callbacks` : `/get/config/callbacks`;
//message.info("Requesting model data");
const response = await fetch(url, {
method: "GET",
headers: {
Authorization: `Bearer ${accessToken}`,
"Content-Type": "application/json",
},
});
if (!response.ok) {
const errorData = await response.text();
message.error(errorData);
throw new Error("Network response was not ok");
}
const data = await response.json();
//message.info("Received model data");
return data;
// Handle success - you might want to update some state or UI based on the created key
} catch (error) {
console.error("Failed to get callbacks:", error);
throw error;
}
};
export const setCallbacksCall = async (
accessToken: String,
formValues: Record<string, any>
) => {
/**
* Set callbacks on proxy
*/
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/config/update` : `/config/update`;
//message.info("Requesting model data");
const response = await fetch(url, {
method: "POST",
headers: {
Authorization: `Bearer ${accessToken}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
...formValues, // Include formValues in the request body
}),
});
if (!response.ok) {
const errorData = await response.text();
message.error(errorData);
throw new Error("Network response was not ok");
}
const data = await response.json();
//message.info("Received model data");
return data;
// Handle success - you might want to update some state or UI based on the created key
} catch (error) {
console.error("Failed to set callbacks:", error);
throw error;
}
};
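For context, a minimal usage sketch of the two helpers added above (not part of the diff; enableLangfuse and its arguments are illustrative assumptions, and the payload mirrors the shape built by the new Settings tab further below):

import { getCallbacksCall, setCallbacksCall } from "./networking";

// Illustrative helper: read the callbacks currently configured on the proxy,
// then enable langfuse by posting a /config/update payload.
const enableLangfuse = async (
  accessToken: string,
  userID: string,
  userRole: string,
  publicKey: string, // placeholder Langfuse public key
  secretKey: string  // placeholder Langfuse secret key
) => {
  const config = await getCallbacksCall(accessToken, userID, userRole);
  console.log("existing success_callback:", config.data.success_callback);

  await setCallbacksCall(accessToken, {
    environment_variables: {
      LANGFUSE_PUBLIC_KEY: publicKey,
      LANGFUSE_SECRET_KEY: secretKey,
    },
    litellm_settings: { success_callback: ["langfuse"] },
  });
};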

View file

@ -0,0 +1,211 @@
import React, { useState, useEffect } from "react";
import {
Card,
Title,
Subtitle,
Table,
TableHead,
TableRow,
Badge,
TableHeaderCell,
TableCell,
TableBody,
Metric,
Text,
Grid,
Button,
Col,
} from "@tremor/react";
import { getCallbacksCall, setCallbacksCall } from "./networking";
import { Modal, Form, Input, Select, Button as Button2 } from "antd";
interface SettingsPageProps {
accessToken: string | null;
userRole: string | null;
userID: string | null;
}
const Settings: React.FC<SettingsPageProps> = ({
accessToken,
userRole,
userID,
}) => {
const [callbacks, setCallbacks] = useState<string[]>([]);
const [isModalVisible, setIsModalVisible] = useState(false);
const [form] = Form.useForm();
const [selectedCallback, setSelectedCallback] = useState<string | null>(null);
useEffect(() => {
if (!accessToken || !userRole || !userID) {
return;
}
getCallbacksCall(accessToken, userID, userRole).then((data) => {
console.log("callbacks", data);
let callbacks_data = data.data;
let callback_names = callbacks_data.success_callback; // ["callback1", "callback2"]
setCallbacks(callback_names);
});
}, [accessToken, userRole, userID]);
const handleAddCallback = () => {
console.log("Add callback clicked");
setIsModalVisible(true);
};
const handleCancel = () => {
setIsModalVisible(false);
form.resetFields();
setSelectedCallback(null);
};
const handleOk = () => {
if (!accessToken) {
return;
}
// Handle form submission
form.validateFields().then((values) => {
// Call API to add the callback
console.log("Form values:", values);
let payload;
if (values.callback === 'langfuse') {
payload = {
environment_variables: {
LANGFUSE_PUBLIC_KEY: values.langfusePublicKey,
LANGFUSE_SECRET_KEY: values.langfusePrivateKey
},
litellm_settings: {
success_callback: [values.callback]
}
};
setCallbacksCall(accessToken, payload);
// add langfuse to callbacks
setCallbacks(callbacks ? [...callbacks, values.callback] : [values.callback]);
} else if (values.callback === 'slack') {
payload = {
general_settings: {
alerting: ["slack"],
alerting_threshold: 300
},
environment_variables: {
SLACK_WEBHOOK_URL: values.slackWebhookUrl
}
};
setCallbacksCall(accessToken, payload);
// add slack to callbacks
setCallbacks(callbacks ? [...callbacks, values.callback] : [values.callback]);
} else {
payload = {
error: 'Invalid callback value'
};
}
setIsModalVisible(false);
form.resetFields();
setSelectedCallback(null);
});
};
const handleCallbackChange = (value: string) => {
setSelectedCallback(value);
};
return (
<div className="w-full mx-4">
<Grid numItems={1} className="gap-2 p-8 h-[75vh] w-full mt-2">
<Card className="h-[15vh]">
<Grid numItems={2} className="mt-2">
<Col>
<Title>Logging Callbacks</Title>
</Col>
<Col>
<div>
{!callbacks ? (
<Badge color={"red"}>None</Badge>
) : callbacks.length === 0 ? (
<Badge>None</Badge>
) : (
callbacks.map((callback, index) => (
<Badge key={index} color={"sky"}>
{callback}
</Badge>
))
)}
</div>
</Col>
</Grid>
<Col>
<Button size="xs" className="mt-2" onClick={handleAddCallback}>
Add Callback
</Button>
</Col>
</Card>
</Grid>
<Modal
title="Add Callback"
visible={isModalVisible}
onOk={handleOk}
width={800}
onCancel={handleCancel}
footer={null}
>
<Form form={form} layout="vertical" onFinish={handleOk}>
<Form.Item
label="Callback"
name="callback"
rules={[{ required: true, message: "Please select a callback" }]}
>
<Select onChange={handleCallbackChange}>
<Select.Option value="langfuse">langfuse</Select.Option>
<Select.Option value="slack">slack alerting</Select.Option>
</Select>
</Form.Item>
{selectedCallback === 'langfuse' && (
<>
<Form.Item
label="LANGFUSE_PUBLIC_KEY"
name="langfusePublicKey"
rules={[
{ required: true, message: "Please enter the public key" },
]}
>
<Input.Password />
</Form.Item>
<Form.Item
label="LANGFUSE_PRIVATE_KEY"
name="langfusePrivateKey"
rules={[
{ required: true, message: "Please enter the private key" },
]}
>
<Input.Password />
</Form.Item>
</>
)}
{selectedCallback === 'slack' && (
<Form.Item
label="SLACK_WEBHOOK_URL"
name="slackWebhookUrl"
rules={[
{ required: true, message: "Please enter the Slack webhook URL" },
]}
>
<Input />
</Form.Item>
)}
<div style={{ textAlign: "right", marginTop: "10px" }}>
<Button2 htmlType="submit">Save</Button2>
</div>
</Form>
</Modal>
</div>
);
};
export default Settings;
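For reference, a sketch of the two request bodies this tab ends up sending to /config/update via setCallbacksCall (values are placeholders, not real credentials):

// langfuse: success callback plus its API keys as proxy environment variables
const langfusePayload = {
  environment_variables: {
    LANGFUSE_PUBLIC_KEY: "pk-lf-...",
    LANGFUSE_SECRET_KEY: "sk-lf-...",
  },
  litellm_settings: { success_callback: ["langfuse"] },
};

// slack: alerting with a 300s threshold plus the webhook URL
const slackPayload = {
  general_settings: { alerting: ["slack"], alerting_threshold: 300 },
  environment_variables: {
    SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/...",
  },
};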

View file

@ -274,6 +274,7 @@ const UsagePage: React.FC<UsagePageProps> = ({
userID={userID}
userRole={userRole}
accessToken={accessToken}
userSpend={null}
/>
<TabGroup>
<TabList className="mt-2">

View file

@ -18,6 +18,7 @@ type UserSpendData = {
max_budget?: number | null;
};
interface UserDashboardProps {
userID: string | null;
userRole: string | null;
@ -52,6 +53,7 @@ const UserDashboard: React.FC<UserDashboardProps> = ({
const token = searchParams.get("token");
const [accessToken, setAccessToken] = useState<string | null>(null);
const [teamSpend, setTeamSpend] = useState<number | null>(null);
const [userModels, setUserModels] = useState<string[]>([]);
const [selectedTeam, setSelectedTeam] = useState<any | null>(
teams ? teams[0] : null
@ -174,8 +176,29 @@ const UserDashboard: React.FC<UserDashboardProps> = ({
fetchData();
}
}
}, [userID, token, accessToken, keys, userRole]);
useEffect(() => {
// This code will run every time selectedTeam changes
if (keys !== null && selectedTeam !== null && selectedTeam !== undefined) {
let sum = 0;
for (const key of keys) {
if (selectedTeam.hasOwnProperty('team_id') && key.team_id !== null && key.team_id === selectedTeam.team_id) {
sum += key.spend;
}
}
setTeamSpend(sum);
} else if (keys !== null) {
// sum the keys which don't have team-id set (default team)
let sum = 0
for (const key of keys) {
sum += key.spend;
}
setTeamSpend(sum);
}
}, [selectedTeam]);
if (userID == null || token == null) {
// Now you can construct the full URL
const url = proxyBaseUrl
@ -204,7 +227,7 @@ const UserDashboard: React.FC<UserDashboardProps> = ({
}
console.log("inside user dashboard, selected team", selectedTeam);
console.log(`teamSpend: ${teamSpend}`)
return (
<div className="w-full mx-4">
<Grid numItems={1} className="gap-2 p-8 h-[75vh] w-full mt-2">
@ -213,6 +236,7 @@ const UserDashboard: React.FC<UserDashboardProps> = ({
userID={userID}
userRole={userRole}
accessToken={accessToken}
userSpend={teamSpend}
/>
<ViewKeyTable

View file

@ -31,16 +31,18 @@ interface ViewUserSpendProps {
userID: string | null;
userRole: string | null;
accessToken: string | null;
userSpend: number | null;
}
const ViewUserSpend: React.FC<ViewUserSpendProps> = ({ userID, userRole, accessToken }) => {
const [spend, setSpend] = useState(0.0);
const ViewUserSpend: React.FC<ViewUserSpendProps> = ({ userID, userRole, accessToken, userSpend }) => {
console.log(`userSpend: ${userSpend}`)
let [spend, setSpend] = useState(userSpend !== null ? userSpend : 0.0);
const [maxBudget, setMaxBudget] = useState(0.0);
useEffect(() => {
const fetchData = async () => {
if (!accessToken || !userID || !userRole) {
return;
}
if (userRole === "Admin") {
if (userRole === "Admin" && userSpend == null) {
try {
const globalSpend = await getTotalSpendCall(accessToken);
if (globalSpend) {
@ -64,13 +66,20 @@ const ViewUserSpend: React.FC<ViewUserSpendProps> = ({ userID, userRole, accessT
fetchData();
}, [userRole, accessToken]);
useEffect(() => {
if (userSpend !== null) {
setSpend(userSpend)
}
}, [userSpend])
const displayMaxBudget = maxBudget !== null ? `$${maxBudget} limit` : "No limit";
const roundedSpend = spend !== undefined ? spend.toFixed(5) : null;
const roundedSpend = spend !== undefined ? spend.toFixed(4) : null;
console.log(`spend in view user spend: ${spend}`)
return (
<>
<p className="text-tremor-default text-tremor-content dark:text-dark-tremor-content">Total Spend (across all teams)</p>
<p className="text-tremor-default text-tremor-content dark:text-dark-tremor-content">Total Spend </p>
<p className="text-3xl text-tremor-content-strong dark:text-dark-tremor-content-strong font-semibold">${roundedSpend}</p>
</>