Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-27 19:54:13 +00:00)

Merge branch 'BerriAI:main' into main
Commit 334772b922
160 changed files with 16239 additions and 1783 deletions
@@ -8,6 +8,11 @@ jobs:
steps:
  - checkout

  - run:
      name: Show git commit hash
      command: |
        echo "Git commit hash: $CIRCLE_SHA1"

  - run:
      name: Check if litellm dir was updated or if pyproject.toml was modified
      command: |
@@ -52,6 +57,7 @@ jobs:
pip install "pytest-mock==3.12.0"
pip install python-multipart
pip install google-cloud-aiplatform
pip install prometheus-client==0.20.0
- save_cache:
    paths:
      - ./venv
@@ -124,6 +130,7 @@ jobs:
build_and_test:
  machine:
    image: ubuntu-2204:2023.10.1
  resource_class: xlarge
  working_directory: ~/project
  steps:
    - checkout
@@ -183,12 +190,19 @@ jobs:
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
-e AZURE_API_KEY=$AZURE_API_KEY \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LANGFUSE_PROJECT1_PUBLIC=$LANGFUSE_PROJECT1_PUBLIC \
-e LANGFUSE_PROJECT2_PUBLIC=$LANGFUSE_PROJECT2_PUBLIC \
-e LANGFUSE_PROJECT1_SECRET=$LANGFUSE_PROJECT1_SECRET \
-e LANGFUSE_PROJECT2_SECRET=$LANGFUSE_PROJECT2_SECRET \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
@@ -293,7 +307,7 @@ jobs:
-H "Accept: application/vnd.github.v3+json" \
-H "Authorization: Bearer $GITHUB_TOKEN" \
"https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
-d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\"}}"
-d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"

workflows:
  version: 2
@@ -1,5 +1,5 @@
/docs
/cookbook
/.circleci
/.github
/tests
docs
cookbook
.circleci
.github
tests
27 .github/workflows/ghcr_deploy.yml (vendored)
@@ -5,10 +5,13 @@ on:
inputs:
  tag:
    description: "The tag version you want to build"
  stable:
    description: "Build Stable version"
    type: boolean
    default: false
  release_type:
    description: "The release type you want to build. Can be 'latest', 'stable', 'dev'"
    type: string
    default: "latest"
  commit_hash:
    description: "Commit hash"
    required: true

# Defines two custom environment variables for the workflow. Used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
@@ -89,9 +92,9 @@ jobs:
- name: Build and push Docker image
  uses: docker/build-push-action@4976231911ebf5f32aad765192d35f942aa48cb8
  with:
    context: .
    context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
    push: true
    tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.stable && 'stable' || 'latest' }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
    tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
    labels: ${{ steps.meta.outputs.labels }}
    platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -125,10 +128,10 @@ jobs:
- name: Build and push Database Docker image
  uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
  with:
    context: .
    context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
    file: Dockerfile.database
    push: true
    tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.stable && 'stable' || 'latest' }}
    tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
    labels: ${{ steps.meta-database.outputs.labels }}
    platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
@@ -162,11 +165,10 @@ jobs:
- name: Build and push Database Docker image
  uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
  with:
    context: .
    context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
    file: ./litellm-js/spend-logs/Dockerfile
    push: true
    tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.stable && 'stable' || 'latest' }}
    labels: ${{ steps.meta-spend-logs.outputs.labels }}
    tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
    platforms: local,linux/amd64,linux/arm64,linux/arm64/v8

build-and-push-helm-chart:
@@ -240,10 +242,13 @@ jobs:
with:
  github-token: "${{ secrets.GITHUB_TOKEN }}"
  script: |
    const commitHash = "${{ github.event.inputs.commit_hash}}";
    console.log("Commit Hash:", commitHash); // Add this line for debugging
    try {
      const response = await github.rest.repos.createRelease({
        draft: false,
        generate_release_notes: true,
        target_commitish: commitHash,
        name: process.env.RELEASE_TAG,
        owner: context.repo.owner,
        prerelease: false,
4 .gitignore (vendored)
@@ -46,3 +46,7 @@ deploy/charts/*.tgz
litellm/proxy/vertex_key.json
**/.vim/
/node_modules
kub.yaml
loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
@@ -70,5 +70,4 @@ EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]

# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
CMD ["--port", "4000"]
@@ -34,7 +34,7 @@ LiteLLM manages:
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)

🚨 **Stable Release:** v1.34.1
🚨 **Stable Release:** Use docker images with the `main-stable` tag. These run through 12 hr load tests (1k req./min).
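For example, pulling and running the stable image — a minimal sketch, assuming it is published under the same `ghcr.io/berriai/litellm` repository as the other tags referenced in this repo:

```shell
# pull the load-tested stable image instead of main-latest
docker pull ghcr.io/berriai/litellm:main-stable

# the image's entrypoint is the litellm CLI, so trailing args go to it
docker run -p 4000:4000 ghcr.io/berriai/litellm:main-stable --port 4000
```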

Support for more providers. Missing a provider or LLM Platform? Raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
@@ -205,7 +205,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [google - vertex_ai [Gemini]](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ |
| [google - palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ |
| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | | ✅ | | |
| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | ✅ | ✅ | ✅ | |
| [mistral ai api](https://docs.litellm.ai/docs/providers/mistral) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [cloudflare AI Workers](https://docs.litellm.ai/docs/providers/cloudflare_workers) | ✅ | ✅ | ✅ | ✅ |
| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | ✅ |
@@ -220,7 +220,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ |
| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ |
| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ |
| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ |
| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
204 cookbook/Proxy_Batch_Users.ipynb (vendored, new file)
@@ -0,0 +1,204 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "680oRk1af-xJ"
      },
      "source": [
        "# Environment Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "X7TgJFn8f88p"
      },
      "outputs": [],
      "source": [
        "import csv\n",
        "from typing import Optional\n",
        "import httpx, json\n",
        "import asyncio\n",
        "\n",
        "proxy_base_url = \"http://0.0.0.0:4000\" # 👈 SET TO PROXY URL\n",
        "master_key = \"sk-1234\" # 👈 SET TO PROXY MASTER KEY"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rauw8EOhgBz5"
      },
      "outputs": [],
      "source": [
        "## GLOBAL HTTP CLIENT ## - faster http calls\n",
        "class HTTPHandler:\n",
        "    def __init__(self, concurrent_limit=1000):\n",
        "        # Create a client with a connection pool\n",
        "        self.client = httpx.AsyncClient(\n",
        "            limits=httpx.Limits(\n",
        "                max_connections=concurrent_limit,\n",
        "                max_keepalive_connections=concurrent_limit,\n",
        "            )\n",
        "        )\n",
        "\n",
        "    async def close(self):\n",
        "        # Close the client when you're done with it\n",
        "        await self.client.aclose()\n",
        "\n",
        "    async def get(\n",
        "        self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None\n",
        "    ):\n",
        "        response = await self.client.get(url, params=params, headers=headers)\n",
        "        return response\n",
        "\n",
        "    async def post(\n",
        "        self,\n",
        "        url: str,\n",
        "        data: Optional[dict] = None,\n",
        "        params: Optional[dict] = None,\n",
        "        headers: Optional[dict] = None,\n",
        "    ):\n",
        "        try:\n",
        "            response = await self.client.post(\n",
        "                url, data=data, params=params, headers=headers\n",
        "            )\n",
        "            return response\n",
        "        except Exception as e:\n",
        "            raise e\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7LXN8zaLgOie"
      },
      "source": [
        "# Import Sheet\n",
        "\n",
        "\n",
        "Format: | ID | Name | Max Budget |"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oiED0usegPGf"
      },
      "outputs": [],
      "source": [
        "async def import_sheet():\n",
        "    tasks = []\n",
        "    http_client = HTTPHandler()\n",
        "    with open('my-batch-sheet.csv', 'r') as file:\n",
        "        csv_reader = csv.DictReader(file)\n",
        "        for row in csv_reader:\n",
        "            task = create_user(client=http_client, user_id=row['ID'], max_budget=row['Max Budget'], user_name=row['Name'])\n",
        "            tasks.append(task)\n",
        "            # print(f\"ID: {row['ID']}, Name: {row['Name']}, Max Budget: {row['Max Budget']}\")\n",
        "\n",
        "    keys = await asyncio.gather(*tasks)\n",
        "\n",
        "    with open('my-batch-sheet_new.csv', 'w', newline='') as new_file:\n",
        "        fieldnames = ['ID', 'Name', 'Max Budget', 'keys']\n",
        "        csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)\n",
        "        csv_writer.writeheader()\n",
        "\n",
        "        with open('my-batch-sheet.csv', 'r') as file:\n",
        "            csv_reader = csv.DictReader(file)\n",
        "            for i, row in enumerate(csv_reader):\n",
        "                row['keys'] = keys[i]  # Add the 'keys' value from the corresponding task result\n",
        "                csv_writer.writerow(row)\n",
        "\n",
        "    await http_client.close()\n",
        "\n",
        "asyncio.run(import_sheet())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "E7M0Li_UgJeZ"
      },
      "source": [
        "# Create Users + Keys\n",
        "\n",
        "- Creates a user\n",
        "- Creates a key with max budget"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NZudRFujf7j-"
      },
      "outputs": [],
      "source": [
        "\n",
        "async def create_key_with_alias(client: HTTPHandler, user_id: str, max_budget: float):\n",
        "    global proxy_base_url\n",
        "    if not proxy_base_url.endswith(\"/\"):\n",
        "        proxy_base_url += \"/\"\n",
        "    url = proxy_base_url + \"key/generate\"\n",
        "\n",
        "    # call /key/generate\n",
        "    print(\"CALLING /KEY/GENERATE\")\n",
        "    response = await client.post(\n",
        "        url=url,\n",
        "        headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
        "        data=json.dumps({\n",
        "            \"user_id\": user_id,\n",
        "            \"key_alias\": f\"{user_id}-key\",\n",
        "            \"max_budget\": max_budget # 👈 KEY CHANGE: SETS MAX BUDGET PER KEY\n",
        "        })\n",
        "    )\n",
        "    print(f\"response: {response.text}\")\n",
        "    return response.json()[\"key\"]\n",
        "\n",
        "async def create_user(client: HTTPHandler, user_id: str, max_budget: float, user_name: str):\n",
        "    \"\"\"\n",
        "    - call /user/new\n",
        "    - create key for user\n",
        "    \"\"\"\n",
        "    global proxy_base_url\n",
        "    if not proxy_base_url.endswith(\"/\"):\n",
        "        proxy_base_url += \"/\"\n",
        "    url = proxy_base_url + \"user/new\"\n",
        "\n",
        "    # call /user/new\n",
        "    await client.post(\n",
        "        url=url,\n",
        "        headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
        "        data=json.dumps({\n",
        "            \"user_id\": user_id,\n",
        "            \"user_alias\": user_name,\n",
        "            \"auto_create_key\": False,\n",
        "            # \"max_budget\": max_budget # 👈 [OPTIONAL] Sets max budget per user (if you want to set a max budget across keys)\n",
        "        })\n",
        "    )\n",
        "\n",
        "    # create key for user\n",
        "    return await create_key_with_alias(client=client, user_id=user_id, max_budget=max_budget)\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
@@ -87,6 +87,7 @@
| command-light | cohere | 0.00003 |
| command-medium-beta | cohere | 0.00003 |
| command-xlarge-beta | cohere | 0.00003 |
| command-r-plus | cohere | 0.000018 |
| j2-ultra | ai21 | 0.00003 |
| ai21.j2-ultra-v1 | bedrock | 0.0000376 |
| gpt-4-1106-preview | openai | 0.00004 |
73 cookbook/misc/config.yaml (new file)
@@ -0,0 +1,73 @@
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/chatgpt-v-2
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_version: "2023-05-15"
      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
  - model_name: gpt-3.5-turbo-large
    litellm_params:
      model: "gpt-3.5-turbo-1106"
      api_key: os.environ/OPENAI_API_KEY
      rpm: 480
      timeout: 300
      stream_timeout: 60
  - model_name: gpt-4
    litellm_params:
      model: azure/chatgpt-v-2
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_version: "2023-05-15"
      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
      rpm: 480
      timeout: 300
      stream_timeout: 60
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420
  - model_name: text-embedding-ada-002
    litellm_params:
      model: azure/azure-embedding-model
      api_key: os.environ/AZURE_API_KEY
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_version: "2023-05-15"
    model_info:
      mode: embedding
      base_model: text-embedding-ada-002
  - model_name: dall-e-2
    litellm_params:
      model: azure/
      api_version: 2023-06-01-preview
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_key: os.environ/AZURE_API_KEY
  - model_name: openai-dall-e-3
    litellm_params:
      model: dall-e-3
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/

litellm_settings:
  drop_params: True
  # max_budget: 100
  # budget_duration: 30d
  num_retries: 5
  request_timeout: 600
  telemetry: False
  context_window_fallbacks: [{"gpt-3.5-turbo": ["gpt-3.5-turbo-large"]}]

general_settings:
  master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys
  store_model_in_db: True
  proxy_budget_rescheduler_min_time: 60
  proxy_budget_rescheduler_max_time: 64
  proxy_batch_write_at: 1
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy

# environment_variables:
  # settings for using redis caching
  # REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
  # REDIS_PORT: "16337"
  # REDIS_PASSWORD:
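A cookbook config like the one above is handed to the proxy at startup. As a rough sketch (the path is just where this file lives in the repo):

```bash
litellm --config cookbook/misc/config.yaml

# Server running on http://0.0.0.0:4000
```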
92 cookbook/misc/migrate_proxy_config.py (new file)
@@ -0,0 +1,92 @@
"""
LiteLLM Migration Script!

Takes a config.yaml and calls /model/new

Inputs:
- File path to config.yaml
- Proxy base url to your hosted proxy

Step 1: Reads your config.yaml
Step 2: reads `model_list` and loops through all models
Step 3: calls `<proxy-base-url>/model/new` for each model
"""

import yaml
import requests

_in_memory_os_variables = {}


def migrate_models(config_file, proxy_base_url):
    # Step 1: Read the config.yaml file
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)

    # Step 2: Read the model_list and loop through all models
    model_list = config.get("model_list", [])
    print("model_list: ", model_list)
    for model in model_list:

        model_name = model.get("model_name")
        print("\nAdding model: ", model_name)
        litellm_params = model.get("litellm_params", {})
        api_base = litellm_params.get("api_base", "")
        print("api_base on config.yaml: ", api_base)

        litellm_model_name = litellm_params.get("model", "") or ""
        if "vertex_ai/" in litellm_model_name:
            print(f"\033[91m\nSkipping Vertex AI model\033[0m", model)
            continue

        for param, value in litellm_params.items():
            if isinstance(value, str) and value.startswith("os.environ/"):
                # check if value is in _in_memory_os_variables
                if value in _in_memory_os_variables:
                    new_value = _in_memory_os_variables[value]
                    print(
                        "\033[92mAlready entered value for \033[0m",
                        value,
                        "\033[92musing \033[0m",
                        new_value,
                    )
                else:
                    new_value = input(f"Enter value for {value}: ")
                    _in_memory_os_variables[value] = new_value
                litellm_params[param] = new_value

        print("\nlitellm_params: ", litellm_params)
        # Confirm before sending POST request
        confirm = input(
            "\033[92mDo you want to send the POST request with the above parameters? (y/n): \033[0m"
        )
        if confirm.lower() != "y":
            print("Aborting POST request.")
            exit()

        # Step 3: Call <proxy-base-url>/model/new for each model
        url = f"{proxy_base_url}/model/new"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {master_key}",
        }
        data = {"model_name": model_name, "litellm_params": litellm_params}
        print("POSTING data to proxy url", url)
        response = requests.post(url, headers=headers, json=data)
        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            raise Exception(f"Error: {response.status_code} - {response.text}")

        # Print the response for each model
        print(
            f"Response for model '{model_name}': Status Code:{response.status_code} - {response.text}"
        )


# Usage
config_file = "config.yaml"
proxy_base_url = "http://0.0.0.0:4000"
master_key = "sk-1234"
print(f"config_file: {config_file}")
print(f"proxy_base_url: {proxy_base_url}")
migrate_models(config_file, proxy_base_url)
@@ -1,10 +1,16 @@
version: "3.9"
services:
  litellm:
    build:
      context: .
      args:
        target: runtime
    image: ghcr.io/berriai/litellm:main-latest
    volumes:
      - ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
    ports:
      - "4000:4000"
    environment:
      - AZURE_API_KEY=sk-123
      - "4000:4000" # Map the container port to the host, change the host port if necessary
    volumes:
      - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
    # You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches with the container port defined above in `ports` value
    command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]

# ...rest of your docker-compose config if any
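With a compose file like the one above saved as `docker-compose.yml` next to `litellm-config.yaml`, bringing the proxy up is a one-liner (a minimal sketch; the health path assumes the proxy's default liveliness endpoint):

```bash
docker compose up -d

# the proxy is then reachable on the host port mapped above
curl http://localhost:4000/health/liveliness
```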
@@ -72,7 +72,7 @@ Here's the code for how we format all providers. Let us know how we can improve
| Anthropic | `claude-instant-1`, `claude-instant-1.2`, `claude-2` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/llms/anthropic.py#L84)
| OpenAI Text Completion | `text-davinci-003`, `text-curie-001`, `text-babbage-001`, `text-ada-001`, `babbage-002`, `davinci-002`, | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/main.py#L442)
| Replicate | all model names starting with `replicate/` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/llms/replicate.py#L180)
| Cohere | `command-nightly`, `command`, `command-light`, `command-medium-beta`, `command-xlarge-beta` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/llms/cohere.py#L115)
| Cohere | `command-nightly`, `command`, `command-light`, `command-medium-beta`, `command-xlarge-beta`, `command-r-plus` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/llms/cohere.py#L115)
| Huggingface | all model names starting with `huggingface/` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/llms/huggingface_restapi.py#L186)
| OpenRouter | all model names starting with `openrouter/` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/main.py#L611)
| AI21 | `j2-mid`, `j2-light`, `j2-ultra` | [Code](https://github.com/BerriAI/litellm/blob/721564c63999a43f96ee9167d0530759d51f8d45/litellm/llms/ai21.py#L107)
45 docs/my-website/docs/completion/vision.md (new file)
@@ -0,0 +1,45 @@
# Using Vision Models

## Quick Start
Example passing images to a model

```python
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "your-api-key"

# openai call
response = completion(
    model = "gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What’s in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                    }
                }
            ]
        }
    ],
)

```

## Checking if a model supports `vision`

Use `litellm.supports_vision(model="")` -> returns `True` if the model supports `vision` and `False` if not

```python
import litellm

assert litellm.supports_vision(model="gpt-4-vision-preview") == True
assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="gpt-3.5-turbo") == False
```
@@ -339,6 +339,8 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02
| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` |
| textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` |
| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` |
| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` |

## Voyage AI Embedding Models
|
|||
# Enterprise
|
||||
For companies that need better security, user management and professional support
|
||||
For companies that need SSO, user management and professional support for LiteLLM Proxy
|
||||
|
||||
:::info
|
||||
|
||||
|
|
|
@@ -8,6 +8,7 @@ liteLLM supports:

- [Custom Callback Functions](https://docs.litellm.ai/docs/observability/custom_callback)
- [Lunary](https://lunary.ai/docs)
- [Langfuse](https://langfuse.com/docs)
- [Helicone](https://docs.helicone.ai/introduction)
- [Traceloop](https://traceloop.com/docs)
- [Athina](https://docs.athina.ai/)

@@ -22,8 +23,8 @@ from litellm import completion

# set callbacks
litellm.input_callback=["sentry"] # for sentry breadcrumbing - logs the input being sent to the api
litellm.success_callback=["posthog", "helicone", "lunary", "athina"]
litellm.failure_callback=["sentry", "lunary"]
litellm.success_callback=["posthog", "helicone", "langfuse", "lunary", "athina"]
litellm.failure_callback=["sentry", "lunary", "langfuse"]

## set env variables
os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= ""

@@ -32,6 +33,9 @@ os.environ["HELICONE_API_KEY"] = ""
os.environ["TRACELOOP_API_KEY"] = ""
os.environ["LUNARY_PUBLIC_KEY"] = ""
os.environ["ATHINA_API_KEY"] = ""
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LANGFUSE_HOST"] = ""

response = completion(model="gpt-3.5-turbo", messages=messages)
```
@@ -177,11 +177,7 @@ print(response)

:::info

Claude returns its output as an XML Tree. [Here is how we translate it](https://github.com/BerriAI/litellm/blob/49642a5b00a53b1babc1a753426a8afcac85dbbe/litellm/llms/prompt_templates/factory.py#L734).

You can see the raw response via `response._hidden_params["original_response"]`.

Claude hallucinates, e.g. returning the list param `value` as `<value>\n<item>apple</item>\n<item>banana</item>\n</value>` or `<value>\n<list>\n<item>apple</item>\n<item>banana</item>\n</list>\n</value>`.
LiteLLM now uses Anthropic's 'tool' param 🎉 (v1.34.29+)
:::
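As a quick illustration of the note above, a sketch of inspecting both the translated tool calls and the raw provider response — the model name and tool schema here are placeholders, and `ANTHROPIC_API_KEY` is assumed to be set:

```python
import litellm

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

response = litellm.completion(
    model="claude-3-opus-20240229",  # placeholder Anthropic model
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
)

# translated, OpenAI-style tool calls
print(response.choices[0].message.tool_calls)

# raw provider response, as described in the note above
print(response._hidden_params.get("original_response"))
```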

```python
@@ -1,55 +1,110 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Azure AI Studio

## Using Mistral models deployed on Azure AI Studio
## Sample Usage

### Sample Usage - setting env vars
**Ensure the following:**
1. The API Base passed ends in the `/v1/` prefix
example:
```python
api_base = "https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com/v1/"
```

Set `MISTRAL_AZURE_API_KEY` and `MISTRAL_AZURE_API_BASE` in your env
2. The `model` passed is listed in [supported models](#supported-models). You **DO NOT** need to pass your deployment name to litellm. Example `model=azure/Mistral-large-nmefg`

```shell
MISTRAL_AZURE_API_KEY = "zE************"
MISTRAL_AZURE_API_BASE = "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1"
```

**Quick Start**
```python
from litellm import completion
import os

response = completion(
    model="mistral/Mistral-large-dfgfj",
    messages=[
        {"role": "user", "content": "hello from litellm"}
    ],
import litellm
response = litellm.completion(
    model="azure/command-r-plus",
    api_base="<your-deployment-base>/v1/",
    api_key="eskk******",
    messages=[{"role": "user", "content": "What is the meaning of life?"}],
)
print(response)
```

### Sample Usage - passing `api_base` and `api_key` to `litellm.completion`
```python
from litellm import completion
import os
## Sample Usage - LiteLLM Proxy

response = completion(
    model="mistral/Mistral-large-dfgfj",
    api_base="https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com",
    api_key = "JGbKodRcTp****",
    messages=[
        {"role": "user", "content": "hello from litellm"}
    ],
)
print(response)
```
1. Add models to your config.yaml

### [LiteLLM Proxy] Using Mistral Models

Set this on your litellm proxy config.yaml
```yaml
model_list:
```yaml
model_list:
  - model_name: mistral
    litellm_params:
      model: mistral/Mistral-large-dfgfj
      api_base: https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com
      model: azure/mistral-large-latest
      api_base: https://Mistral-large-dfgfj-serverless.eastus2.inference.ai.azure.com/v1/
      api_key: JGbKodRcTp****
```
  - model_name: command-r-plus
    litellm_params:
      model: azure/command-r-plus
      api_key: os.environ/AZURE_COHERE_API_KEY
      api_base: os.environ/AZURE_COHERE_API_BASE
```

2. Start the proxy

```bash
$ litellm --config /path/to/config.yaml
```

3. Send Request to LiteLLM Proxy Server

<Tabs>

<TabItem value="openai" label="OpenAI Python v1.0.0+">

```python
import openai
client = openai.OpenAI(
    api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
    base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)

response = client.chat.completions.create(
    model="mistral",
    messages = [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ],
)

print(response)
```
</TabItem>

<TabItem value="curl" label="curl">

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "mistral",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ],
}'
```
</TabItem>

</Tabs>

## Supported Models

| Model Name | Function Call |
|--------------------------|------------------------------------------------------------|
| Cohere command-r-plus | `completion(model="azure/command-r-plus", messages)` |
| Cohere command-r | `completion(model="azure/command-r", messages)` |
| mistral-large-latest | `completion(model="azure/mistral-large-latest", messages)` |
@@ -47,6 +47,7 @@ for chunk in response:
|------------|----------------|
| command-r | `completion('command-r', messages)` |
| command-light | `completion('command-light', messages)` |
| command-r-plus | `completion('command-r-plus', messages)` |
| command-medium | `completion('command-medium', messages)` |
| command-medium-beta | `completion('command-medium-beta', messages)` |
| command-xlarge-nightly | `completion('command-xlarge-nightly', messages)` |
@@ -96,8 +96,7 @@ print(content)

## Chat Models
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------|-------------------------|
|-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro | `completion('gemini/gemini-1.5-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion('gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-vision | `completion('gemini/gemini-1.5-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
@@ -51,3 +51,104 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |

## Groq - Tool / Function Calling Example

```python
# Example dummy function hard coded to return the current weather
import json
import litellm

def get_current_weather(location, unit="fahrenheit"):
    """Get the current weather in a given location"""
    if "tokyo" in location.lower():
        return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
    elif "san francisco" in location.lower():
        return json.dumps(
            {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
        )
    elif "paris" in location.lower():
        return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
    else:
        return json.dumps({"location": location, "temperature": "unknown"})


# Step 1: send the conversation and available functions to the model
messages = [
    {
        "role": "system",
        "content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
    },
    {
        "role": "user",
        "content": "What's the weather like in San Francisco?",
    },
]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]
response = litellm.completion(
    model="groq/llama2-70b-4096",
    messages=messages,
    tools=tools,
    tool_choice="auto",  # auto is default, but we'll be explicit
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls


# Step 2: check if the model wanted to call a function
if tool_calls:
    # Step 3: call the function
    # Note: the JSON response may not always be valid; be sure to handle errors
    available_functions = {
        "get_current_weather": get_current_weather,
    }
    messages.append(
        response_message
    )  # extend conversation with assistant's reply
    print("Response message\n", response_message)
    # Step 4: send the info for each function call and function response to the model
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        function_to_call = available_functions[function_name]
        function_args = json.loads(tool_call.function.arguments)
        function_response = function_to_call(
            location=function_args.get("location"),
            unit=function_args.get("unit"),
        )
        messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )  # extend conversation with function response
    print(f"messages: {messages}")
    second_response = litellm.completion(
        model="groq/llama2-70b-4096", messages=messages
    )  # get a new response from the model where it can see the function response
    print("second response\n", second_response)
```
@@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# OpenAI
LiteLLM supports OpenAI Chat + Text completion and embedding calls.
LiteLLM supports OpenAI Chat + Embedding calls.

### Required API Keys
@@ -44,7 +44,11 @@ export OPENAI_API_KEY=""
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
      model: openai/gpt-3.5-turbo # The `openai/` prefix will call openai.chat.completions.create
      api_key: os.environ/OPENAI_API_KEY
  - model_name: gpt-3.5-turbo-instruct
    litellm_params:
      model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create
      api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
@@ -159,6 +163,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL

| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-turbo-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
@@ -213,19 +218,6 @@ response = completion(

```

## OpenAI Text Completion Models / Instruct Models

| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-0914", messages=messages)` |
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
| babbage-001 | `response = completion(model="babbage-001", messages=messages)` |
| babbage-002 | `response = completion(model="babbage-002", messages=messages)` |
| davinci-002 | `response = completion(model="davinci-002", messages=messages)` |

## Advanced

### Parallel Function calling
@@ -5,7 +5,9 @@ import TabItem from '@theme/TabItem';

To call models hosted behind an openai proxy, make 2 changes:

1. Put `openai/` in front of your model name, so litellm knows you're trying to call an openai-compatible endpoint.
1. For `/chat/completions`: Put `openai/` in front of your model name, so litellm knows you're trying to call an openai `/chat/completions` endpoint.

2. For `/completions`: Put `text-completion-openai/` in front of your model name, so litellm knows you're trying to call an openai `/completions` endpoint.

2. **Do NOT** add anything additional to the base url e.g. `/v1/embedding`. LiteLLM uses the openai-client to make these calls, and that automatically adds the relevant endpoints.
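As a rough illustration of both prefixes, a minimal sketch — the model names, base URL, and key below are placeholders, not values from this doc:

```python
import litellm

# /chat/completions route on the openai-compatible server
chat_response = litellm.completion(
    model="openai/my-hosted-chat-model",  # `openai/` prefix -> chat completions endpoint
    api_base="http://localhost:8080/v1",  # base url only, no extra path
    api_key="placeholder-key",
    messages=[{"role": "user", "content": "hello"}],
)

# /completions route on the same server
text_response = litellm.completion(
    model="text-completion-openai/my-hosted-text-model",  # -> completions endpoint
    api_base="http://localhost:8080/v1",
    api_key="placeholder-key",
    messages=[{"role": "user", "content": "hello"}],
)

print(chat_response.choices[0].message.content)
```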
163 docs/my-website/docs/providers/text_completion_openai.md (new file)
@@ -0,0 +1,163 @@
# OpenAI (Text Completion)

LiteLLM supports OpenAI text completion models

### Required API Keys

```python
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
```

### Usage
```python
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "your-api-key"

# openai call
response = completion(
    model = "gpt-3.5-turbo-instruct",
    messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```

### Usage - LiteLLM Proxy Server

Here's how to call OpenAI models with the LiteLLM Proxy Server

### 1. Save key in your environment

```bash
export OPENAI_API_KEY=""
```

### 2. Start the proxy

<Tabs>
<TabItem value="config" label="config.yaml">

```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: openai/gpt-3.5-turbo # The `openai/` prefix will call openai.chat.completions.create
      api_key: os.environ/OPENAI_API_KEY
  - model_name: gpt-3.5-turbo-instruct
    litellm_params:
      model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create
      api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
<TabItem value="config-*" label="config.yaml - proxy all OpenAI models">

Use this to add all openai models with one API Key. **WARNING: This will not do any load balancing**
This means requests to `gpt-4`, `gpt-3.5-turbo`, `gpt-4-turbo-preview` will all go through this route

```yaml
model_list:
  - model_name: "*" # all requests where model not in your config go to this deployment
    litellm_params:
      model: openai/* # set `openai/` to use the openai route
      api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
<TabItem value="cli" label="CLI">

```bash
$ litellm --model gpt-3.5-turbo-instruct

# Server running on http://0.0.0.0:4000
```
</TabItem>

</Tabs>

### 3. Test it

<Tabs>
<TabItem value="Curl" label="Curl Request">

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
    "model": "gpt-3.5-turbo-instruct",
    "messages": [
      {
        "role": "user",
        "content": "what llm are you"
      }
    ]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">

```python
import openai
client = openai.OpenAI(
    api_key="anything",
    base_url="http://0.0.0.0:4000"
)

# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo-instruct", messages = [
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
    }
])

print(response)

```
</TabItem>
<TabItem value="langchain" label="Langchain">

```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage

chat = ChatOpenAI(
    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
    model = "gpt-3.5-turbo-instruct",
    temperature=0.1
)

messages = [
    SystemMessage(
        content="You are a helpful assistant that im using to make a test request to."
    ),
    HumanMessage(
        content="test from litellm. tell me why it's amazing in 1 sentence"
    ),
]
response = chat(messages)

print(response)
```
</TabItem>
</Tabs>


## OpenAI Text Completion Models / Instruct Models

| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-0914", messages=messages)` |
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
| babbage-001 | `response = completion(model="babbage-001", messages=messages)` |
| babbage-002 | `response = completion(model="babbage-002", messages=messages)` |
| davinci-002 | `response = completion(model="davinci-002", messages=messages)` |
@@ -1,7 +1,8 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# VertexAI - Google [Gemini, Model Garden]
# VertexAI [Anthropic, Gemini, Model Garden]

<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@@ -129,6 +130,100 @@ Here's how to use Vertex AI with the LiteLLM Proxy Server

</Tabs>

## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:

<Tabs>

<TabItem value="sdk" label="SDK">

```python
response = completion(
    model="gemini/gemini-pro",
    messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}],
    safety_settings=[
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_NONE",
        },
    ]
)
```
</TabItem>
<TabItem value="proxy" label="Proxy">

**Option 1: Set in config**
```yaml
model_list:
  - model_name: gemini-experimental
    litellm_params:
      model: vertex_ai/gemini-experimental
      vertex_project: litellm-epic
      vertex_location: us-central1
      safety_settings:
      - category: HARM_CATEGORY_HARASSMENT
        threshold: BLOCK_NONE
      - category: HARM_CATEGORY_HATE_SPEECH
        threshold: BLOCK_NONE
      - category: HARM_CATEGORY_SEXUALLY_EXPLICIT
        threshold: BLOCK_NONE
      - category: HARM_CATEGORY_DANGEROUS_CONTENT
        threshold: BLOCK_NONE
```

**Option 2: Set on call**

```python
response = client.chat.completions.create(
    model="gemini-experimental",
    messages=[
        {
            "role": "user",
            "content": "Can you write exploits?",
        }
    ],
    max_tokens=8192,
    stream=False,
    temperature=0.0,

    extra_body={
        "safety_settings": [
            {
                "category": "HARM_CATEGORY_HARASSMENT",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_HATE_SPEECH",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                "threshold": "BLOCK_NONE",
            },
        ],
    }
)
```
</TabItem>
</Tabs>

## Set Vertex Project & Vertex Location
All calls using Vertex AI require the following parameters:
* Your Project ID
@@ -155,6 +250,84 @@ os.environ["VERTEXAI_LOCATION"] = "us-central1 # Your Location
# set directly on module
litellm.vertex_location = "us-central1" # Your Location
```
## Anthropic
| Model Name | Function Call |
|------------------|--------------------------------------|
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |

### Usage

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import completion
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""

model = "claude-3-sonnet@20240229"

vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]

response = completion(
    model="vertex_ai/" + model,
    messages=[{"role": "user", "content": "hi"}],
    temperature=0.7,
    vertex_ai_project=vertex_ai_project,
    vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">

**1. Add to config**

```yaml
model_list:
  - model_name: anthropic-vertex
    litellm_params:
      model: vertex_ai/claude-3-sonnet@20240229
      vertex_ai_project: "my-test-project"
      vertex_ai_location: "us-east-1"
  - model_name: anthropic-vertex
    litellm_params:
      model: vertex_ai/claude-3-sonnet@20240229
      vertex_ai_project: "my-test-project"
      vertex_ai_location: "us-west-1"
```

**2. Start proxy**

```bash
litellm --config /path/to/config.yaml

# RUNNING at http://0.0.0.0:4000
```

**3. Test it!**

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "anthropic-vertex", # 👈 the 'model_name' in config
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ],
}'
```

</TabItem>
</Tabs>

## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|
@@ -181,18 +354,15 @@ response = completion(
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |

| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |

## Gemini Pro Vision
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)`|

## Gemini 1.5 Pro (and Vision)
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-1.5-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)`|
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
@@ -308,6 +478,7 @@ print(response)

## Extra

### Using `GOOGLE_APPLICATION_CREDENTIALS`
Here's the code for storing your service account credentials as `GOOGLE_APPLICATION_CREDENTIALS` environment variable:
@ -344,3 +515,34 @@ def load_vertex_ai_credentials():
|
|||
# Export the temporary file as GOOGLE_APPLICATION_CREDENTIALS
|
||||
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(temp_file.name)
|
||||
```
|
||||
|
||||
|
||||
### Using GCP Service Account
|
||||
|
||||
1. Figure out the Service Account bound to the Google Cloud Run service
|
||||
|
||||
<Image img={require('../../img/gcp_acc_1.png')} />
|
||||
|
||||
2. Get the FULL EMAIL address of the corresponding Service Account
|
||||
|
||||
3. Next, go to IAM & Admin > Manage Resources , select your top-level project that houses your Google Cloud Run Service
|
||||
|
||||
Click `Add Principal`
|
||||
|
||||
<Image img={require('../../img/gcp_acc_2.png')}/>
|
||||
|
||||
4. Specify the Service Account as the principal and Vertex AI User as the role
|
||||
|
||||
<Image img={require('../../img/gcp_acc_3.png')}/>
|
||||
|
||||
Once that's done, when you deploy the new container in the Google Cloud Run service, LiteLLM will have automatic access to all Vertex AI endpoints.
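If you prefer the CLI over the console, the same role grant can be sketched with `gcloud` roughly as follows (the project ID and service-account email are placeholders):

```bash
# grant the Cloud Run service account the Vertex AI User role
gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \
  --member="serviceAccount:YOUR_SERVICE_ACCOUNT@YOUR_PROJECT_ID.iam.gserviceaccount.com" \
  --role="roles/aiplatform.user"
```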
|
||||
|
||||
|
||||
s/o @[Darien Kindlund](https://www.linkedin.com/in/kindlund/) for this tutorial
@ -25,8 +25,11 @@ All models listed here https://docs.voyageai.com/embeddings/#models-and-specific

| Model Name               | Function Call                                              |
|--------------------------|------------------------------------------------------------|
| voyage-2                 | `embedding(model="voyage/voyage-2", input)`                |
| voyage-large-2           | `embedding(model="voyage/voyage-large-2", input)`          |
| voyage-law-2             | `embedding(model="voyage/voyage-law-2", input)`            |
| voyage-code-2            | `embedding(model="voyage/voyage-code-2", input)`           |
| voyage-lite-02-instruct  | `embedding(model="voyage/voyage-lite-02-instruct", input)` |
| voyage-01                | `embedding(model="voyage/voyage-01", input)`               |
| voyage-lite-01           | `embedding(model="voyage/voyage-lite-01", input)`          |
| voyage-lite-01-instruct  | `embedding(model="voyage/voyage-lite-01-instruct", input)` |
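
A minimal sketch of one of these calls; the `VOYAGE_API_KEY` environment variable name is an assumption following LiteLLM's usual provider-key convention:

```python
# Embedding call for one of the models in the table above.
import os
from litellm import embedding

os.environ["VOYAGE_API_KEY"] = "your-voyage-api-key"  # assumption: key supplied via env var

response = embedding(
    model="voyage/voyage-2",
    input=["good morning from litellm"],
)
print(response)
```
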

9  docs/my-website/docs/proxy/demo.md  Normal file
@ -0,0 +1,9 @@
# 🎉 Demo App

Here is a demo of the proxy. To log in, pass in:

- Username: admin
- Password: sk-1234

[Demo UI](https://demo.litellm.ai/ui)

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# ✨ Enterprise Features - Content Mod
# ✨ Enterprise Features - Content Mod, SSO

Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)

@ -12,16 +12,18 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
:::

Features:
- ✅ [SSO for Admin UI](./ui.md#✨-enterprise-features)
- ✅ Content Moderation with LLM Guard
- ✅ Content Moderation with LlamaGuard
- ✅ Content Moderation with Google Text Moderations
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- ✅ Don't log/store specific requests (eg confidential LLM requests)
- ✅ Don't log/store specific requests to Langfuse, Sentry, etc. (eg confidential LLM requests)
- ✅ Tracking Spend for Custom Tags


## Content Moderation
### Content Moderation with LLM Guard

@ -74,7 +76,7 @@ curl --location 'http://localhost:4000/key/generate' \
# Returns {..'key': 'my-new-key'}
```

**2. Test it!**
**3. Test it!**

```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \

@ -87,6 +89,76 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
}'
```

#### Turn on/off per request

**1. Update config**
```yaml
litellm_settings:
  callbacks: ["llmguard_moderations"]
  llm_guard_mode: "request-specific"
```

**2. Create new key**

```bash
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "models": ["fake-openai-endpoint"]
}'

# Returns {..'key': 'my-new-key'}
```

**3. Test it!**

<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">

```python
import openai
client = openai.OpenAI(
    api_key="sk-1234",
    base_url="http://0.0.0.0:4000"
)

# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages = [
        {
            "role": "user",
            "content": "this is a test request, write a short poem"
        }
    ],
    extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
        "metadata": {
            "permissions": {
                "enable_llm_guard_check": True # 👈 KEY CHANGE
            },
        }
    }
)

print(response)
```
</TabItem>
<TabItem value="curl" label="Curl Request">

```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer my-new-key' \ # 👈 TEST KEY
--data '{"model": "fake-openai-endpoint", "messages": [
    {"role": "system", "content": "Be helpful"},
    {"role": "user", "content": "What do you know?"}
    ]
}'
```

</TabItem>
</Tabs>

### Content Moderation with LlamaGuard

@ -51,3 +51,21 @@ http://localhost:4000/metrics
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model"` |
| `litellm_spend_metric` | Total Spend, per `"user", "key", "model"` |
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model"` |

## Monitor System Health

To monitor the health of litellm adjacent services (redis / postgres), do:

```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
litellm_settings:
  service_callback: ["prometheus_system"]
```

| Metric Name             | Description                          |
|-------------------------|--------------------------------------|
| `litellm_redis_latency` | histogram latency for redis calls    |
| `litellm_redis_fails`   | Number of failed redis calls         |
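
Once the proxy is running with `service_callback: ["prometheus_system"]`, a quick sketch for checking these metrics (the URL and port assume the default local proxy address used elsewhere in these docs):

```python
# Fetch /metrics and print only the redis service-health series from the table above.
import requests

metrics_text = requests.get("http://localhost:4000/metrics", timeout=10).text
for line in metrics_text.splitlines():
    if line.startswith("litellm_redis_"):  # e.g. litellm_redis_latency, litellm_redis_fails
        print(line)
```
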
@ -9,9 +9,9 @@ Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTeleme

- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
- [Logging to Sentry](#logging-proxy-inputoutput---sentry)
- [Logging to Traceloop (OpenTelemetry)](#logging-proxy-inputoutput-traceloop-opentelemetry)

@ -539,6 +539,36 @@ print(response)
</Tabs>


### Team based Logging to Langfuse

**Example:**

This config would send langfuse logs to 2 different langfuse projects, based on the team id

```yaml
litellm_settings:
  default_team_settings:
    - team_id: my-secret-project
      success_callback: ["langfuse"]
      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
      langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
    - team_id: ishaans-secret-project
      success_callback: ["langfuse"]
      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
      langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```

Now, when you [generate keys](./virtual_keys.md) for this team-id

```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```

All requests made with these keys will log data to their team-specific logging.
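
For example, a request made with a key from `ishaans-secret-project` lands in that team's Langfuse project — a minimal sketch, where `sk-my-team-key` stands in for the key returned by `/key/generate` above:

```python
# Request made with the team-scoped key; logs go to that team's Langfuse project.
import openai

client = openai.OpenAI(api_key="sk-my-team-key", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello from ishaans-secret-project"}],
)
print(response.choices[0].message.content)
```
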
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]`, which will log all successful LLM calls to DataDog

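In the Python SDK the equivalent one-liner looks like this (the DataDog credential env var names are an assumption; set them per your DataDog account):

```python
# SDK equivalent of the proxy config change described above.
import os
import litellm

os.environ["DD_API_KEY"] = "your-datadog-api-key"  # assumption: DataDog creds via env vars
os.environ["DD_SITE"] = "datadoghq.com"            # assumption: your DataDog site

litellm.success_callback = ["datadog"]  # log all successful LLM calls to DataDog
```
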
@ -99,7 +99,7 @@ Now, when you [generate keys](./virtual_keys.md) for this team-id
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-D '{"team_id": "ishaans-secret-project"}'
-d '{"team_id": "ishaans-secret-project"}'
```

All requests made with these keys will log data to their team-specific logging.

@ -108,6 +108,34 @@ general_settings:
  litellm_jwtauth:
    admin_jwt_scope: "litellm-proxy-admin"
```

## Advanced - Spend Tracking (User / Team / Org)

Set the field in the JWT token that corresponds to a litellm user / team / org.

```yaml
general_settings:
  master_key: sk-1234
  enable_jwt_auth: True
  litellm_jwtauth:
    admin_jwt_scope: "litellm-proxy-admin"
    team_id_jwt_field: "client_id" # 👈 CAN BE ANY FIELD
    user_id_jwt_field: "sub" # 👈 CAN BE ANY FIELD
    org_id_jwt_field: "org_id" # 👈 CAN BE ANY FIELD
```

Expected JWT:

```
{
  "client_id": "my-unique-team",
  "sub": "my-unique-user",
  "org_id": "my-unique-org"
}
```

Now litellm will automatically update the spend for the user/team/org in the db for each call.
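
Purely illustrative: minting a token with the claims from the `Expected JWT` block above (PyJWT here only shows the claim names the proxy reads; the key and algorithm are placeholders, not how the proxy validates tokens):

```python
# Sketch of a JWT carrying the team/user/org fields configured above.
import jwt  # PyJWT

claims = {
    "client_id": "my-unique-team",  # team_id_jwt_field
    "sub": "my-unique-user",        # user_id_jwt_field
    "org_id": "my-unique-org",      # org_id_jwt_field
}
token = jwt.encode(claims, "placeholder-secret", algorithm="HS256")
print(jwt.decode(token, "placeholder-secret", algorithms=["HS256"]))
```
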
### JWT Scopes

Here's what scopes on JWT-Auth tokens look like

@ -56,6 +56,9 @@ On accessing the LiteLLM UI, you will be prompted to enter your username, passwo

## ✨ Enterprise Features

Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)


### Setup SSO/Auth for UI

#### Step 1: Set upperbounds for keys

@ -95,12 +95,129 @@ print(response)
|
|||
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
|
||||
- `router.aimage_generation()` - async image generation calls
|
||||
|
||||
### Advanced
|
||||
### Advanced - Routing Strategies
|
||||
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
|
||||
|
||||
Router provides 4 strategies for routing your calls across multiple deployments:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="usage-based-v2" label="Rate-Limit Aware v2 (ASYNC)">
|
||||
|
||||
**🎉 NEW** This is an async implementation of usage-based-routing.
|
||||
|
||||
**Filters out deployment if tpm/rpm limit exceeded** - If you pass in the deployment's tpm/rpm limits.
|
||||
|
||||
Routes to **deployment with lowest TPM usage** for that minute.
|
||||
|
||||
In production, we use Redis to track usage (TPM/RPM) across multiple deployments. This implementation uses **async redis calls** (redis.incr and redis.mget).
|
||||
|
||||
For Azure, your RPM = TPM/6.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="sdk">
|
||||
|
||||
```python
import os
from litellm import Router


model_list = [{ # list of model deployments
    "model_name": "gpt-3.5-turbo", # model alias
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-v-2", # actual model name
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    },
    "tpm": 100000,
    "rpm": 10000,
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "azure/chatgpt-functioncalling",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE")
    },
    "tpm": 100000,
    "rpm": 1000,
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": { # params for litellm completion/embedding call
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
    "tpm": 100000,
    "rpm": 1000,
}]
router = Router(model_list=model_list,
                redis_host=os.environ["REDIS_HOST"],
                redis_password=os.environ["REDIS_PASSWORD"],
                redis_port=os.environ["REDIS_PORT"],
                routing_strategy="usage-based-routing-v2", # 👈 KEY CHANGE
                enable_pre_call_check=True, # enables router rate limits for concurrent calls
                )

response = await router.acompletion(model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}]
)

print(response)
```
</TabItem>
<TabItem value="proxy" label="proxy">

**1. Set strategy in config**

```yaml
model_list:
  - model_name: gpt-3.5-turbo # model alias
    litellm_params: # params for litellm completion/embedding call
      model: azure/chatgpt-v-2 # actual model name
      api_key: os.environ/AZURE_API_KEY
      api_version: os.environ/AZURE_API_VERSION
      api_base: os.environ/AZURE_API_BASE
    tpm: 100000
    rpm: 10000
  - model_name: gpt-3.5-turbo
    litellm_params: # params for litellm completion/embedding call
      model: gpt-3.5-turbo
      api_key: os.environ/OPENAI_API_KEY
    tpm: 100000
    rpm: 1000

router_settings:
  routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE
  redis_host: <your-redis-host>
  redis_password: <your-redis-password>
  redis_port: <your-redis-port>
  enable_pre_call_check: true

general_settings:
  master_key: sk-1234
```

**2. Start proxy**

```bash
litellm --config /path/to/config.yaml
```

**3. Test it!**

```bash
curl --location 'http://localhost:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hey, how's it going?"}]
}'
```

</TabItem>
</Tabs>


</TabItem>
<TabItem value="latency-based" label="Latency-Based">

@ -117,7 +234,10 @@ import asyncio
model_list = [{ ... }]

# init router
router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy
router = Router(model_list=model_list,
                routing_strategy="latency-based-routing", # 👈 set routing strategy
                enable_pre_call_check=True, # enables router rate limits for concurrent calls
                )

## CALL 1+2
tasks = []

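For orientation, a self-contained sketch assembled from the snippet above (deployment params are placeholders; the doc's own example continues below this hunk):

```python
# Latency-based routing: two deployments of the same alias, concurrent calls,
# the router prefers the lower-latency deployment.
import os
import asyncio
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY")},
    },
]

router = Router(model_list=model_list, routing_strategy="latency-based-routing")  # 👈 set routing strategy

async def main():
    tasks = [
        router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": f"say hi #{i}"}],
        )
        for i in range(2)
    ]
    for r in await asyncio.gather(*tasks):
        print(r.choices[0].message.content)

asyncio.run(main())
```
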
@ -257,8 +377,9 @@ router = Router(model_list=model_list,
                redis_host=os.environ["REDIS_HOST"],
                redis_password=os.environ["REDIS_PASSWORD"],
                redis_port=os.environ["REDIS_PORT"],
                routing_strategy="usage-based-routing")

                routing_strategy="usage-based-routing",
                enable_pre_call_check=True, # enables router rate limits for concurrent calls
                )

response = await router.acompletion(model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}]

@ -555,7 +676,11 @@ router = Router(model_list: Optional[list] = None,

## Pre-Call Checks (Context Window)

Enable pre-call checks to filter out deployments with context window limit < messages for a call.
Enable pre-call checks to filter out:
1. deployments with context window limit < messages for a call.
2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages])`)

<Tabs>
<TabItem value="sdk" label="SDK">

BIN  docs/my-website/img/gcp_acc_1.png  Normal file  (Binary file not shown. Size: 91 KiB)
BIN  docs/my-website/img/gcp_acc_2.png  Normal file  (Binary file not shown. Size: 298 KiB)
BIN  docs/my-website/img/gcp_acc_3.png  Normal file  (Binary file not shown. Size: 208 KiB)

@ -36,6 +36,7 @@ const sidebars = {
        label: "📖 All Endpoints (Swagger)",
        href: "https://litellm-api.up.railway.app/",
      },
      "proxy/demo",
      "proxy/configs",
      "proxy/reliability",
      "proxy/users",

@ -85,6 +86,7 @@ const sidebars = {
      "completion/stream",
      "completion/message_trimming",
      "completion/function_call",
      "completion/vision",
      "completion/model_alias",
      "completion/batching",
      "completion/mock_requests",

@ -114,6 +116,7 @@ const sidebars = {
      },
      items: [
        "providers/openai",
        "providers/text_completion_openai",
        "providers/openai_compatible",
        "providers/azure",
        "providers/azure_ai",

@ -163,7 +166,6 @@ const sidebars = {
      "debugging/local_debugging",
      "observability/callbacks",
      "observability/custom_callback",
      "observability/lunary_integration",
      "observability/langfuse_integration",
      "observability/sentry",
      "observability/promptlayer_integration",

@ -171,6 +173,7 @@ const sidebars = {
      "observability/langsmith_integration",
      "observability/slack_integration",
      "observability/traceloop_integration",
      "observability/lunary_integration",
      "observability/athina_integration",
      "observability/helicone_integration",
      "observability/supabase_integration",

@ -95,7 +95,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
            traceback.print_exc()
            raise e

    def should_proceed(self, user_api_key_dict: UserAPIKeyAuth) -> bool:
    def should_proceed(self, user_api_key_dict: UserAPIKeyAuth, data: dict) -> bool:
        if self.llm_guard_mode == "key-specific":
            # check if llm guard enabled for specific keys only
            self.print_verbose(

@ -108,6 +108,15 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
            return True
        elif self.llm_guard_mode == "all":
            return True
        elif self.llm_guard_mode == "request-specific":
            self.print_verbose(f"received metadata: {data.get('metadata', {})}")
            metadata = data.get("metadata", {})
            permissions = metadata.get("permissions", {})
            if (
                "enable_llm_guard_check" in permissions
                and permissions["enable_llm_guard_check"] == True
            ):
                return True
            return False

    async def async_moderation_hook(

@ -126,7 +135,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
            f"Inside LLM Guard Pre-Call Hook - llm_guard_mode={self.llm_guard_mode}"
        )

        _proceed = self.should_proceed(user_api_key_dict=user_api_key_dict)
        _proceed = self.should_proceed(user_api_key_dict=user_api_key_dict, data=data)
        if _proceed == False:
            return

@ -1,5 +1,6 @@
|
|||
# Enterprise Proxy Util Endpoints
|
||||
from litellm._logging import verbose_logger
|
||||
import collections
|
||||
|
||||
|
||||
async def get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
|
||||
|
@ -17,6 +18,48 @@ async def get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
|
|||
return response
|
||||
|
||||
|
||||
async def ui_get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
|
||||
response = await prisma_client.db.query_raw(
|
||||
"""
|
||||
SELECT
|
||||
jsonb_array_elements_text(request_tags) AS individual_request_tag,
|
||||
DATE(s."startTime") AS spend_date,
|
||||
COUNT(*) AS log_count,
|
||||
SUM(spend) AS total_spend
|
||||
FROM "LiteLLM_SpendLogs" s
|
||||
WHERE s."startTime" >= current_date - interval '30 days'
|
||||
GROUP BY individual_request_tag, spend_date
|
||||
ORDER BY spend_date;
|
||||
"""
|
||||
)
|
||||
|
||||
# print("tags - spend")
|
||||
# print(response)
|
||||
# Bar Chart 1 - Spend per tag - Top 10 tags by spend
|
||||
total_spend_per_tag = collections.defaultdict(float)
|
||||
total_requests_per_tag = collections.defaultdict(int)
|
||||
for row in response:
|
||||
tag_name = row["individual_request_tag"]
|
||||
tag_spend = row["total_spend"]
|
||||
|
||||
total_spend_per_tag[tag_name] += tag_spend
|
||||
total_requests_per_tag[tag_name] += row["log_count"]
|
||||
|
||||
sorted_tags = sorted(total_spend_per_tag.items(), key=lambda x: x[1], reverse=True)
|
||||
# convert to ui format
|
||||
ui_tags = []
|
||||
for tag in sorted_tags:
|
||||
ui_tags.append(
|
||||
{
|
||||
"name": tag[0],
|
||||
"value": tag[1],
|
||||
"log_count": total_requests_per_tag[tag[0]],
|
||||
}
|
||||
)
|
||||
|
||||
return {"top_10_tags": ui_tags}
|
||||
|
||||
|
||||
async def view_spend_logs_from_clickhouse(
|
||||
api_key=None, user_id=None, request_id=None, start_date=None, end_date=None
|
||||
):
|
||||
|
|
|
@ -3,7 +3,11 @@ import threading, requests, os
|
|||
from typing import Callable, List, Optional, Dict, Union, Any, Literal
|
||||
from litellm.caching import Cache
|
||||
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
|
||||
from litellm.proxy._types import KeyManagementSystem, KeyManagementSettings
|
||||
from litellm.proxy._types import (
|
||||
KeyManagementSystem,
|
||||
KeyManagementSettings,
|
||||
LiteLLM_UpperboundKeyGenerateParams,
|
||||
)
|
||||
import httpx
|
||||
import dotenv
|
||||
|
||||
|
@ -15,6 +19,7 @@ if set_verbose == True:
|
|||
input_callback: List[Union[str, Callable]] = []
|
||||
success_callback: List[Union[str, Callable]] = []
|
||||
failure_callback: List[Union[str, Callable]] = []
|
||||
service_callback: List[Union[str, Callable]] = []
|
||||
callbacks: List[Callable] = []
|
||||
_async_input_callback: List[Callable] = (
|
||||
[]
|
||||
|
@ -64,7 +69,7 @@ google_moderation_confidence_threshold: Optional[float] = None
|
|||
llamaguard_unsafe_content_categories: Optional[str] = None
|
||||
blocked_user_list: Optional[Union[str, List]] = None
|
||||
banned_keywords_list: Optional[Union[str, List]] = None
|
||||
llm_guard_mode: Literal["all", "key-specific"] = "all"
|
||||
llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
|
||||
##################
|
||||
logging: bool = True
|
||||
caching: bool = (
|
||||
|
@ -172,7 +177,7 @@ dynamodb_table_name: Optional[str] = None
|
|||
s3_callback_params: Optional[Dict] = None
|
||||
generic_logger_headers: Optional[Dict] = None
|
||||
default_key_generate_params: Optional[Dict] = None
|
||||
upperbound_key_generate_params: Optional[Dict] = None
|
||||
upperbound_key_generate_params: Optional[LiteLLM_UpperboundKeyGenerateParams] = None
|
||||
default_user_params: Optional[Dict] = None
|
||||
default_team_settings: Optional[List] = None
|
||||
max_user_budget: Optional[float] = None
|
||||
|
@ -260,6 +265,7 @@ open_ai_chat_completion_models: List = []
|
|||
open_ai_text_completion_models: List = []
|
||||
cohere_models: List = []
|
||||
cohere_chat_models: List = []
|
||||
mistral_chat_models: List = []
|
||||
anthropic_models: List = []
|
||||
openrouter_models: List = []
|
||||
vertex_language_models: List = []
|
||||
|
@ -269,6 +275,7 @@ vertex_code_chat_models: List = []
|
|||
vertex_text_models: List = []
|
||||
vertex_code_text_models: List = []
|
||||
vertex_embedding_models: List = []
|
||||
vertex_anthropic_models: List = []
|
||||
ai21_models: List = []
|
||||
nlp_cloud_models: List = []
|
||||
aleph_alpha_models: List = []
|
||||
|
@ -284,6 +291,8 @@ for key, value in model_cost.items():
|
|||
cohere_models.append(key)
|
||||
elif value.get("litellm_provider") == "cohere_chat":
|
||||
cohere_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "mistral":
|
||||
mistral_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "anthropic":
|
||||
anthropic_models.append(key)
|
||||
elif value.get("litellm_provider") == "openrouter":
|
||||
|
@ -302,6 +311,9 @@ for key, value in model_cost.items():
|
|||
vertex_code_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "vertex_ai-embedding-models":
|
||||
vertex_embedding_models.append(key)
|
||||
elif value.get("litellm_provider") == "vertex_ai-anthropic_models":
|
||||
key = key.replace("vertex_ai/", "")
|
||||
vertex_anthropic_models.append(key)
|
||||
elif value.get("litellm_provider") == "ai21":
|
||||
ai21_models.append(key)
|
||||
elif value.get("litellm_provider") == "nlp_cloud":
|
||||
|
@ -571,6 +583,7 @@ from .utils import (
|
|||
completion_cost,
|
||||
supports_function_calling,
|
||||
supports_parallel_function_calling,
|
||||
supports_vision,
|
||||
get_litellm_params,
|
||||
Logging,
|
||||
acreate,
|
||||
|
@ -588,6 +601,7 @@ from .utils import (
|
|||
_should_retry,
|
||||
get_secret,
|
||||
get_supported_openai_params,
|
||||
get_api_base,
|
||||
)
|
||||
from .llms.huggingface_restapi import HuggingfaceConfig
|
||||
from .llms.anthropic import AnthropicConfig
|
||||
|
@ -603,6 +617,7 @@ from .llms.nlp_cloud import NLPCloudConfig
|
|||
from .llms.aleph_alpha import AlephAlphaConfig
|
||||
from .llms.petals import PetalsConfig
|
||||
from .llms.vertex_ai import VertexAIConfig
|
||||
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
|
||||
from .llms.sagemaker import SagemakerConfig
|
||||
from .llms.ollama import OllamaConfig
|
||||
from .llms.ollama_chat import OllamaChatConfig
|
||||
|
|
71
litellm/_service_logger.py
Normal file
71
litellm/_service_logger.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
import litellm
|
||||
from .types.services import ServiceTypes, ServiceLoggerPayload
|
||||
from .integrations.prometheus_services import PrometheusServicesLogger
|
||||
|
||||
|
||||
class ServiceLogging:
|
||||
"""
|
||||
Separate class used for monitoring health of litellm-adjacent services (redis/postgres).
|
||||
"""
|
||||
|
||||
def __init__(self, mock_testing: bool = False) -> None:
|
||||
self.mock_testing = mock_testing
|
||||
self.mock_testing_sync_success_hook = 0
|
||||
self.mock_testing_async_success_hook = 0
|
||||
self.mock_testing_sync_failure_hook = 0
|
||||
self.mock_testing_async_failure_hook = 0
|
||||
|
||||
if "prometheus_system" in litellm.service_callback:
|
||||
self.prometheusServicesLogger = PrometheusServicesLogger()
|
||||
|
||||
def service_success_hook(self, service: ServiceTypes, duration: float):
|
||||
"""
|
||||
[TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
|
||||
"""
|
||||
if self.mock_testing:
|
||||
self.mock_testing_sync_success_hook += 1
|
||||
|
||||
def service_failure_hook(
|
||||
self, service: ServiceTypes, duration: float, error: Exception
|
||||
):
|
||||
"""
|
||||
[TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
|
||||
"""
|
||||
if self.mock_testing:
|
||||
self.mock_testing_sync_failure_hook += 1
|
||||
|
||||
async def async_service_success_hook(self, service: ServiceTypes, duration: float):
|
||||
"""
|
||||
- For counting if the redis, postgres call is successful
|
||||
"""
|
||||
if self.mock_testing:
|
||||
self.mock_testing_async_success_hook += 1
|
||||
|
||||
payload = ServiceLoggerPayload(
|
||||
is_error=False, error=None, service=service, duration=duration
|
||||
)
|
||||
for callback in litellm.service_callback:
|
||||
if callback == "prometheus_system":
|
||||
await self.prometheusServicesLogger.async_service_success_hook(
|
||||
payload=payload
|
||||
)
|
||||
|
||||
async def async_service_failure_hook(
|
||||
self, service: ServiceTypes, duration: float, error: Exception
|
||||
):
|
||||
"""
|
||||
- For counting if the redis, postgres call is unsuccessful
|
||||
"""
|
||||
if self.mock_testing:
|
||||
self.mock_testing_async_failure_hook += 1
|
||||
|
||||
payload = ServiceLoggerPayload(
|
||||
is_error=True, error=str(error), service=service, duration=duration
|
||||
)
|
||||
for callback in litellm.service_callback:
|
||||
if callback == "prometheus_system":
|
||||
                if self.prometheusServicesLogger is None:
                    # initialize lazily if the logger wasn't created in __init__
                    self.prometheusServicesLogger = PrometheusServicesLogger()
|
||||
await self.prometheusServicesLogger.async_service_failure_hook(
|
||||
payload=payload
|
||||
)
|
|
@ -13,6 +13,8 @@ import json, traceback, ast, hashlib
|
|||
from typing import Optional, Literal, List, Union, Any, BinaryIO
|
||||
from openai._models import BaseModel as OpenAIObject
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm._service_logger import ServiceLogging
|
||||
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
|
||||
import traceback
|
||||
|
||||
|
||||
|
@ -81,9 +83,30 @@ class InMemoryCache(BaseCache):
|
|||
return cached_response
|
||||
return None
|
||||
|
||||
def batch_get_cache(self, keys: list, **kwargs):
|
||||
return_val = []
|
||||
for k in keys:
|
||||
val = self.get_cache(key=k, **kwargs)
|
||||
return_val.append(val)
|
||||
return return_val
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
return self.get_cache(key=key, **kwargs)
|
||||
|
||||
async def async_batch_get_cache(self, keys: list, **kwargs):
|
||||
return_val = []
|
||||
for k in keys:
|
||||
val = self.get_cache(key=k, **kwargs)
|
||||
return_val.append(val)
|
||||
return return_val
|
||||
|
||||
async def async_increment(self, key, value: int, **kwargs) -> int:
|
||||
# get the value
|
||||
init_value = await self.async_get_cache(key=key) or 0
|
||||
value = init_value + value
|
||||
await self.async_set_cache(key, value, **kwargs)
|
||||
return value
|
||||
|
||||
def flush_cache(self):
|
||||
self.cache_dict.clear()
|
||||
self.ttl_dict.clear()
|
||||
|
@ -142,6 +165,9 @@ class RedisCache(BaseCache):
|
|||
except Exception as e:
|
||||
pass
|
||||
|
||||
### HEALTH MONITORING OBJECT ###
|
||||
self.service_logger_obj = ServiceLogging()
|
||||
|
||||
def init_async_client(self):
|
||||
from ._redis import get_redis_async_client
|
||||
|
||||
|
@ -173,17 +199,59 @@ class RedisCache(BaseCache):
|
|||
)
|
||||
|
||||
async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
|
||||
start_time = time.time()
|
||||
try:
|
||||
keys = []
|
||||
_redis_client = self.init_async_client()
|
||||
async with _redis_client as redis_client:
|
||||
async for key in redis_client.scan_iter(match=pattern + "*", count=count):
|
||||
async for key in redis_client.scan_iter(
|
||||
match=pattern + "*", count=count
|
||||
):
|
||||
keys.append(key)
|
||||
if len(keys) >= count:
|
||||
break
|
||||
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_success_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration
|
||||
)
|
||||
) # DO NOT SLOW DOWN CALL B/C OF THIS
|
||||
return keys
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
async def async_set_cache(self, key, value, **kwargs):
|
||||
start_time = time.time()
|
||||
try:
|
||||
_redis_client = self.init_async_client()
|
||||
except Exception as e:
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
verbose_logger.error(
|
||||
"LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
|
||||
str(e),
|
||||
value,
|
||||
)
|
||||
traceback.print_exc()
|
||||
|
||||
key = self.check_and_fix_namespace(key=key)
|
||||
async with _redis_client as redis_client:
|
||||
ttl = kwargs.get("ttl", None)
|
||||
|
@ -195,7 +263,21 @@ class RedisCache(BaseCache):
|
|||
print_verbose(
|
||||
f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
|
||||
)
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_success_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
verbose_logger.error(
|
||||
"LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
|
||||
|
@ -209,6 +291,7 @@ class RedisCache(BaseCache):
|
|||
Use Redis Pipelines for bulk write operations
|
||||
"""
|
||||
_redis_client = self.init_async_client()
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
async with redis_client.pipeline(transaction=True) as pipe:
|
||||
|
@ -228,8 +311,25 @@ class RedisCache(BaseCache):
|
|||
|
||||
print_verbose(f"pipeline results: {results}")
|
||||
# Optionally, you could process 'results' to make sure that all set operations were successful.
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_success_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration
|
||||
)
|
||||
)
|
||||
return results
|
||||
except Exception as e:
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
|
||||
verbose_logger.error(
|
||||
"LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
|
||||
str(e),
|
||||
|
@ -244,7 +344,40 @@ class RedisCache(BaseCache):
|
|||
key = self.check_and_fix_namespace(key=key)
|
||||
self.redis_batch_writing_buffer.append((key, value))
|
||||
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
|
||||
await self.flush_cache_buffer()
|
||||
await self.flush_cache_buffer() # logging done in here
|
||||
|
||||
async def async_increment(self, key, value: int, **kwargs) -> int:
|
||||
_redis_client = self.init_async_client()
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
result = await redis_client.incr(name=key, amount=value)
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_success_hook(
|
||||
service=ServiceTypes.REDIS,
|
||||
duration=_duration,
|
||||
)
|
||||
)
|
||||
return result
|
||||
except Exception as e:
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
verbose_logger.error(
|
||||
"LiteLLM Redis Caching: async async_increment() - Got exception from REDIS %s, Writing value=%s",
|
||||
str(e),
|
||||
value,
|
||||
)
|
||||
traceback.print_exc()
|
||||
raise e
|
||||
|
||||
async def flush_cache_buffer(self):
|
||||
print_verbose(
|
||||
|
@ -283,40 +416,17 @@ class RedisCache(BaseCache):
|
|||
traceback.print_exc()
|
||||
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
_redis_client = self.init_async_client()
|
||||
key = self.check_and_fix_namespace(key=key)
|
||||
async with _redis_client as redis_client:
|
||||
try:
|
||||
print_verbose(f"Get Async Redis Cache: key: {key}")
|
||||
cached_response = await redis_client.get(key)
|
||||
print_verbose(
|
||||
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
|
||||
)
|
||||
response = self._get_cache_logic(cached_response=cached_response)
|
||||
return response
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
print_verbose(
|
||||
f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
|
||||
)
|
||||
|
||||
async def async_get_cache_pipeline(self, key_list) -> dict:
|
||||
def batch_get_cache(self, key_list) -> dict:
|
||||
"""
|
||||
Use Redis for bulk read operations
|
||||
"""
|
||||
_redis_client = await self.init_async_client()
|
||||
key_value_dict = {}
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
async with redis_client.pipeline(transaction=True) as pipe:
|
||||
# Queue the get operations in the pipeline for all keys.
|
||||
_keys = []
|
||||
for cache_key in key_list:
|
||||
cache_key = self.check_and_fix_namespace(key=cache_key)
|
||||
pipe.get(cache_key) # Queue GET command in pipeline
|
||||
|
||||
# Execute the pipeline and await the results.
|
||||
results = await pipe.execute()
|
||||
_keys.append(cache_key)
|
||||
results = self.redis_client.mget(keys=_keys)
|
||||
|
||||
# Associate the results back with their keys.
|
||||
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
|
||||
|
@ -332,6 +442,89 @@ class RedisCache(BaseCache):
|
|||
print_verbose(f"Error occurred in pipeline read - {str(e)}")
|
||||
return key_value_dict
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
_redis_client = self.init_async_client()
|
||||
key = self.check_and_fix_namespace(key=key)
|
||||
start_time = time.time()
|
||||
async with _redis_client as redis_client:
|
||||
try:
|
||||
print_verbose(f"Get Async Redis Cache: key: {key}")
|
||||
cached_response = await redis_client.get(key)
|
||||
print_verbose(
|
||||
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
|
||||
)
|
||||
response = self._get_cache_logic(cached_response=cached_response)
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_success_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration
|
||||
)
|
||||
)
|
||||
return response
|
||||
except Exception as e:
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
print_verbose(
|
||||
f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
|
||||
)
|
||||
|
||||
async def async_batch_get_cache(self, key_list) -> dict:
|
||||
"""
|
||||
Use Redis for bulk read operations
|
||||
"""
|
||||
_redis_client = await self.init_async_client()
|
||||
key_value_dict = {}
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
_keys = []
|
||||
for cache_key in key_list:
|
||||
cache_key = self.check_and_fix_namespace(key=cache_key)
|
||||
_keys.append(cache_key)
|
||||
results = await redis_client.mget(keys=_keys)
|
||||
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_success_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration
|
||||
)
|
||||
)
|
||||
|
||||
# Associate the results back with their keys.
|
||||
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
|
||||
key_value_dict = dict(zip(key_list, results))
|
||||
|
||||
decoded_results = {}
|
||||
for k, v in key_value_dict.items():
|
||||
if isinstance(k, bytes):
|
||||
k = k.decode("utf-8")
|
||||
v = self._get_cache_logic(v)
|
||||
decoded_results[k] = v
|
||||
|
||||
return decoded_results
|
||||
except Exception as e:
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
self.service_logger_obj.async_service_failure_hook(
|
||||
service=ServiceTypes.REDIS, duration=_duration, error=e
|
||||
)
|
||||
)
|
||||
print_verbose(f"Error occurred in pipeline read - {str(e)}")
|
||||
return key_value_dict
|
||||
|
||||
async def ping(self):
|
||||
_redis_client = self.init_async_client()
|
||||
async with _redis_client as redis_client:
|
||||
|
@ -897,6 +1090,39 @@ class DualCache(BaseCache):
|
|||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
def batch_get_cache(self, keys: list, local_only: bool = False, **kwargs):
|
||||
try:
|
||||
result = [None for _ in range(len(keys))]
|
||||
if self.in_memory_cache is not None:
|
||||
in_memory_result = self.in_memory_cache.batch_get_cache(keys, **kwargs)
|
||||
|
||||
print_verbose(f"in_memory_result: {in_memory_result}")
|
||||
if in_memory_result is not None:
|
||||
result = in_memory_result
|
||||
|
||||
if None in result and self.redis_cache is not None and local_only == False:
|
||||
"""
|
||||
- for the none values in the result
|
||||
- check the redis cache
|
||||
"""
|
||||
sublist_keys = [
|
||||
key for key, value in zip(keys, result) if value is None
|
||||
]
|
||||
# If not found in in-memory cache, try fetching from Redis
|
||||
redis_result = self.redis_cache.batch_get_cache(sublist_keys, **kwargs)
|
||||
if redis_result is not None:
|
||||
# Update in-memory cache with the value from Redis
|
||||
for key in redis_result:
|
||||
self.in_memory_cache.set_cache(key, redis_result[key], **kwargs)
|
||||
|
||||
for key, value in redis_result.items():
|
||||
result[sublist_keys.index(key)] = value
|
||||
|
||||
print_verbose(f"async batch get cache: cache result: {result}")
|
||||
return result
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_get_cache(self, key, local_only: bool = False, **kwargs):
|
||||
# Try to fetch from in-memory cache first
|
||||
try:
|
||||
|
@ -930,6 +1156,50 @@ class DualCache(BaseCache):
|
|||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_batch_get_cache(
|
||||
self, keys: list, local_only: bool = False, **kwargs
|
||||
):
|
||||
try:
|
||||
result = [None for _ in range(len(keys))]
|
||||
if self.in_memory_cache is not None:
|
||||
in_memory_result = await self.in_memory_cache.async_batch_get_cache(
|
||||
keys, **kwargs
|
||||
)
|
||||
|
||||
print_verbose(f"in_memory_result: {in_memory_result}")
|
||||
if in_memory_result is not None:
|
||||
result = in_memory_result
|
||||
|
||||
if None in result and self.redis_cache is not None and local_only == False:
|
||||
"""
|
||||
- for the none values in the result
|
||||
- check the redis cache
|
||||
"""
|
||||
sublist_keys = [
|
||||
key for key, value in zip(keys, result) if value is None
|
||||
]
|
||||
# If not found in in-memory cache, try fetching from Redis
|
||||
redis_result = await self.redis_cache.async_batch_get_cache(
|
||||
sublist_keys, **kwargs
|
||||
)
|
||||
|
||||
if redis_result is not None:
|
||||
# Update in-memory cache with the value from Redis
|
||||
for key in redis_result:
|
||||
await self.in_memory_cache.async_set_cache(
|
||||
key, redis_result[key], **kwargs
|
||||
)
|
||||
|
||||
sublist_dict = dict(zip(sublist_keys, redis_result))
|
||||
|
||||
for key, value in sublist_dict.items():
|
||||
result[sublist_keys.index(key)] = value
|
||||
|
||||
print_verbose(f"async batch get cache: cache result: {result}")
|
||||
return result
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_set_cache(self, key, value, local_only: bool = False, **kwargs):
|
||||
try:
|
||||
if self.in_memory_cache is not None:
|
||||
|
@ -941,6 +1211,32 @@ class DualCache(BaseCache):
|
|||
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_increment_cache(
|
||||
self, key, value: int, local_only: bool = False, **kwargs
|
||||
) -> int:
|
||||
"""
|
||||
Key - the key in cache
|
||||
|
||||
Value - int - the value you want to increment by
|
||||
|
||||
Returns - int - the incremented value
|
||||
"""
|
||||
try:
|
||||
result: int = value
|
||||
if self.in_memory_cache is not None:
|
||||
result = await self.in_memory_cache.async_increment(
|
||||
key, value, **kwargs
|
||||
)
|
||||
|
||||
if self.redis_cache is not None and local_only == False:
|
||||
result = await self.redis_cache.async_increment(key, value, **kwargs)
|
||||
|
||||
return result
|
||||
except Exception as e:
|
||||
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
raise e
|
||||
|
||||
def flush_cache(self):
|
||||
if self.in_memory_cache is not None:
|
||||
self.in_memory_cache.flush_cache()
|
||||
|
|
|
@ -161,7 +161,7 @@ class LangFuseLogger:
|
|||
verbose_logger.info(f"Langfuse Layer Logging - logging success")
|
||||
except:
|
||||
traceback.print_exc()
|
||||
print(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
pass
|
||||
|
||||
async def _async_log_event(
|
||||
|
@ -190,7 +190,7 @@ class LangFuseLogger:
|
|||
):
|
||||
from langfuse.model import CreateTrace, CreateGeneration
|
||||
|
||||
print(
|
||||
verbose_logger.warning(
|
||||
"Please upgrade langfuse to v2.0.0 or higher: https://github.com/langfuse/langfuse-python/releases/tag/v2.0.1"
|
||||
)
|
||||
|
||||
|
@ -247,7 +247,6 @@ class LangFuseLogger:
|
|||
|
||||
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
|
||||
|
||||
print(f"response_obj: {response_obj}")
|
||||
if supports_tags:
|
||||
metadata_tags = metadata.get("tags", [])
|
||||
tags = metadata_tags
|
||||
|
@ -306,31 +305,35 @@ class LangFuseLogger:
|
|||
tags.append(f"cache_hit:{kwargs['cache_hit']}")
|
||||
trace_params.update({"tags": tags})
|
||||
|
||||
print_verbose(f"trace_params: {trace_params}")
|
||||
|
||||
trace = self.Langfuse.trace(**trace_params)
|
||||
|
||||
generation_id = None
|
||||
usage = None
|
||||
if response_obj is not None and response_obj.get("id", None) is not None:
|
||||
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
|
||||
print(f"getting usage, cost={cost}")
|
||||
usage = {
|
||||
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
|
||||
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||
"total_cost": cost if supports_costs else None,
|
||||
}
|
||||
print(f"constructed usage - {usage}")
|
||||
generation_name = metadata.get("generation_name", None)
|
||||
if generation_name is None:
|
||||
# just log `litellm-{call_type}` as the generation name
|
||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
system_fingerprint = response_obj.get("system_fingerprint", None)
|
||||
if system_fingerprint is not None:
|
||||
optional_params["system_fingerprint"] = system_fingerprint
|
||||
|
||||
generation_params = {
|
||||
"name": generation_name,
|
||||
"id": metadata.get("generation_id", generation_id),
|
||||
"startTime": start_time,
|
||||
"endTime": end_time,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"model": kwargs["model"],
|
||||
"modelParameters": optional_params,
|
||||
"model_parameters": optional_params,
|
||||
"input": input,
|
||||
"output": output,
|
||||
"usage": usage,
|
||||
|
@ -342,13 +345,15 @@ class LangFuseLogger:
|
|||
generation_params["prompt"] = metadata.get("prompt", None)
|
||||
|
||||
if output is not None and isinstance(output, str) and level == "ERROR":
|
||||
generation_params["statusMessage"] = output
|
||||
generation_params["status_message"] = output
|
||||
|
||||
if supports_completion_start_time:
|
||||
generation_params["completion_start_time"] = kwargs.get(
|
||||
"completion_start_time", None
|
||||
)
|
||||
|
||||
print_verbose(f"generation_params: {generation_params}")
|
||||
|
||||
trace.generation(**generation_params)
|
||||
except Exception as e:
|
||||
print(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
|
|
|
@ -53,6 +53,8 @@ class LangsmithLogger:
|
|||
value = kwargs[key]
|
||||
if key == "start_time" or key == "end_time":
|
||||
pass
|
||||
elif type(value) == datetime.datetime:
|
||||
new_kwargs[key] = value.isoformat()
|
||||
elif type(value) != dict:
|
||||
new_kwargs[key] = value
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@ import dotenv
|
|||
import importlib
|
||||
import sys
|
||||
|
||||
import packaging
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
|
@ -56,13 +58,12 @@ class LunaryLogger:
|
|||
def __init__(self):
|
||||
try:
|
||||
import lunary
|
||||
from pkg_resources import parse_version
|
||||
|
||||
version = importlib.metadata.version("lunary")
|
||||
# if version < 0.1.43 then raise ImportError
|
||||
if parse_version(version) < parse_version("0.1.43"):
|
||||
if packaging.version.Version(version) < packaging.version.Version("0.1.43"):
|
||||
print(
|
||||
"Lunary version outdated. Required: > 0.1.43. Upgrade via 'pip install lunary --upgrade'"
|
||||
"Lunary version outdated. Required: >= 0.1.43. Upgrade via 'pip install lunary --upgrade'"
|
||||
)
|
||||
raise ImportError
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# used for /metrics endpoint on LiteLLM Proxy
|
||||
#### What this does ####
|
||||
# On success + failure, log events to Supabase
|
||||
# On success, log events to Prometheus
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
|
177
litellm/integrations/prometheus_services.py
Normal file
177
litellm/integrations/prometheus_services.py
Normal file
|
@ -0,0 +1,177 @@
|
|||
# used for monitoring litellm services health on `/metrics` endpoint on LiteLLM Proxy
|
||||
#### What this does ####
|
||||
# On success + failure, log events to Prometheus for litellm / adjacent services (litellm, redis, postgres, llm api providers)
|
||||
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
import datetime, subprocess, sys
|
||||
import litellm, uuid
|
||||
from litellm._logging import print_verbose, verbose_logger
|
||||
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
|
||||
|
||||
|
||||
class PrometheusServicesLogger:
|
||||
# Class variables or attributes
|
||||
litellm_service_latency = None # Class-level attribute to store the Histogram
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mock_testing: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
try:
|
||||
try:
|
||||
from prometheus_client import Counter, Histogram, REGISTRY
|
||||
except ImportError:
|
||||
raise Exception(
|
||||
"Missing prometheus_client. Run `pip install prometheus-client`"
|
||||
)
|
||||
|
||||
self.Histogram = Histogram
|
||||
self.Counter = Counter
|
||||
self.REGISTRY = REGISTRY
|
||||
|
||||
verbose_logger.debug(f"in init prometheus services metrics")
|
||||
|
||||
self.services = [item.value for item in ServiceTypes]
|
||||
|
||||
self.payload_to_prometheus_map = (
|
||||
{}
|
||||
) # store the prometheus histogram/counter we need to call for each field in payload
|
||||
|
||||
for service in self.services:
|
||||
histogram = self.create_histogram(service)
|
||||
counter = self.create_counter(service)
|
||||
self.payload_to_prometheus_map[service] = [histogram, counter]
|
||||
|
||||
self.prometheus_to_amount_map: dict = (
|
||||
{}
|
||||
) # the field / value in ServiceLoggerPayload the object needs to be incremented by
|
||||
|
||||
### MOCK TESTING ###
|
||||
self.mock_testing = mock_testing
|
||||
self.mock_testing_success_calls = 0
|
||||
self.mock_testing_failure_calls = 0
|
||||
|
||||
except Exception as e:
|
||||
print_verbose(f"Got exception on init prometheus client {str(e)}")
|
||||
raise e
|
||||
|
||||
def is_metric_registered(self, metric_name) -> bool:
|
||||
for metric in self.REGISTRY.collect():
|
||||
if metric_name == metric.name:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_metric(self, metric_name):
|
||||
for metric in self.REGISTRY.collect():
|
||||
for sample in metric.samples:
|
||||
if metric_name == sample.name:
|
||||
return metric
|
||||
return None
|
||||
|
||||
def create_histogram(self, label: str):
|
||||
metric_name = "litellm_{}_latency".format(label)
|
||||
is_registered = self.is_metric_registered(metric_name)
|
||||
if is_registered:
|
||||
return self.get_metric(metric_name)
|
||||
return self.Histogram(
|
||||
metric_name,
|
||||
"Latency for {} service".format(label),
|
||||
labelnames=[label],
|
||||
)
|
||||
|
||||
def create_counter(self, label: str):
|
||||
metric_name = "litellm_{}_failed_requests".format(label)
|
||||
is_registered = self.is_metric_registered(metric_name)
|
||||
if is_registered:
|
||||
return self.get_metric(metric_name)
|
||||
return self.Counter(
|
||||
metric_name,
|
||||
"Total failed requests for {} service".format(label),
|
||||
labelnames=[label],
|
||||
)
|
||||
|
||||
def observe_histogram(
|
||||
self,
|
||||
histogram,
|
||||
labels: str,
|
||||
amount: float,
|
||||
):
|
||||
assert isinstance(histogram, self.Histogram)
|
||||
|
||||
histogram.labels(labels).observe(amount)
|
||||
|
||||
def increment_counter(
|
||||
self,
|
||||
counter,
|
||||
labels: str,
|
||||
amount: float,
|
||||
):
|
||||
assert isinstance(counter, self.Counter)
|
||||
|
||||
counter.labels(labels).inc(amount)
|
||||
|
||||
def service_success_hook(self, payload: ServiceLoggerPayload):
|
||||
if self.mock_testing:
|
||||
self.mock_testing_success_calls += 1
|
||||
|
||||
if payload.service.value in self.payload_to_prometheus_map:
|
||||
prom_objects = self.payload_to_prometheus_map[payload.service.value]
|
||||
for obj in prom_objects:
|
||||
if isinstance(obj, self.Histogram):
|
||||
self.observe_histogram(
|
||||
histogram=obj,
|
||||
labels=payload.service.value,
|
||||
amount=payload.duration,
|
||||
)
|
||||
|
||||
def service_failure_hook(self, payload: ServiceLoggerPayload):
|
||||
if self.mock_testing:
|
||||
self.mock_testing_failure_calls += 1
|
||||
|
||||
if payload.service.value in self.payload_to_prometheus_map:
|
||||
prom_objects = self.payload_to_prometheus_map[payload.service.value]
|
||||
for obj in prom_objects:
|
||||
if isinstance(obj, self.Counter):
|
||||
self.increment_counter(
|
||||
counter=obj,
|
||||
labels=payload.service.value,
|
||||
amount=1, # LOG ERROR COUNT TO PROMETHEUS
|
||||
)
|
||||
|
||||
async def async_service_success_hook(self, payload: ServiceLoggerPayload):
|
||||
"""
|
||||
Log successful call to prometheus
|
||||
"""
|
||||
if self.mock_testing:
|
||||
self.mock_testing_success_calls += 1
|
||||
|
||||
if payload.service.value in self.payload_to_prometheus_map:
|
||||
prom_objects = self.payload_to_prometheus_map[payload.service.value]
|
||||
for obj in prom_objects:
|
||||
if isinstance(obj, self.Histogram):
|
||||
self.observe_histogram(
|
||||
histogram=obj,
|
||||
labels=payload.service.value,
|
||||
amount=payload.duration,
|
||||
)
|
||||
|
||||
async def async_service_failure_hook(self, payload: ServiceLoggerPayload):
|
||||
print(f"received error payload: {payload.error}")
|
||||
if self.mock_testing:
|
||||
self.mock_testing_failure_calls += 1
|
||||
|
||||
if payload.service.value in self.payload_to_prometheus_map:
|
||||
prom_objects = self.payload_to_prometheus_map[payload.service.value]
|
||||
for obj in prom_objects:
|
||||
if isinstance(obj, self.Counter):
|
||||
self.increment_counter(
|
||||
counter=obj,
|
||||
labels=payload.service.value,
|
||||
amount=1, # LOG ERROR COUNT TO PROMETHEUS
|
||||
)
|
|
@ -2,18 +2,13 @@ import os, types
|
|||
import json
|
||||
from enum import Enum
|
||||
import requests, copy
|
||||
import time, uuid
|
||||
import time
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
|
||||
import litellm
|
||||
from .prompt_templates.factory import (
|
||||
contains_tag,
|
||||
prompt_factory,
|
||||
custom_prompt,
|
||||
construct_tool_use_system_prompt,
|
||||
extract_between_tags,
|
||||
parse_xml_params,
|
||||
)
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
from .base import BaseLLM
|
||||
import httpx
|
||||
|
||||
|
||||
|
@ -21,6 +16,8 @@ class AnthropicConstants(Enum):
|
|||
HUMAN_PROMPT = "\n\nHuman: "
|
||||
AI_PROMPT = "\n\nAssistant: "
|
||||
|
||||
# constants from https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/_constants.py
|
||||
|
||||
|
||||
class AnthropicError(Exception):
|
||||
def __init__(self, status_code, message):
|
||||
|
@ -37,12 +34,14 @@ class AnthropicError(Exception):
|
|||
|
||||
class AnthropicConfig:
|
||||
"""
|
||||
Reference: https://docs.anthropic.com/claude/reference/complete_post
|
||||
Reference: https://docs.anthropic.com/claude/reference/messages_post
|
||||
|
||||
to pass metadata to anthropic, it's {"user_id": "any-relevant-information"}
|
||||
"""
|
||||
|
||||
max_tokens: Optional[int] = litellm.max_tokens # anthropic requires a default
|
||||
max_tokens: Optional[int] = (
|
||||
4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
|
||||
)
|
||||
stop_sequences: Optional[list] = None
|
||||
temperature: Optional[int] = None
|
||||
top_p: Optional[int] = None
|
||||
|
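A hedged usage sketch of the new default: `max_tokens` now falls back to 4096 when not set, and an explicit value still wins. The model name is illustrative and the call assumes the standard `litellm.completion` entry point:

    import litellm

    response = litellm.completion(
        model="claude-3-opus-20240229",  # any Anthropic messages-API model
        messages=[{"role": "user", "content": "Say hello in one word."}],
        max_tokens=256,  # explicit value overrides AnthropicConfig's 4096 default
    )
    print(response.choices[0].message.content)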
@ -52,7 +51,9 @@ class AnthropicConfig:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
max_tokens: Optional[int] = 256, # anthropic requires a default
|
||||
max_tokens: Optional[
|
||||
int
|
||||
] = 4096, # You can pass in a value yourself or use the default value 4096
|
||||
stop_sequences: Optional[list] = None,
|
||||
temperature: Optional[int] = None,
|
||||
top_p: Optional[int] = None,
|
||||
|
@ -101,7 +102,221 @@ def validate_environment(api_key, user_headers):
|
|||
return headers
|
||||
|
||||
|
||||
def completion(
|
||||
class AnthropicChatCompletion(BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def process_response(
|
||||
self,
|
||||
model,
|
||||
response,
|
||||
model_response,
|
||||
_is_function_call,
|
||||
stream,
|
||||
logging_obj,
|
||||
api_key,
|
||||
data,
|
||||
messages,
|
||||
print_verbose,
|
||||
):
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"raw model_response: {response.text}")
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise AnthropicError(
|
||||
message=response.text, status_code=response.status_code
|
||||
)
|
||||
if "error" in completion_response:
|
||||
raise AnthropicError(
|
||||
message=str(completion_response["error"]),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
elif len(completion_response["content"]) == 0:
|
||||
raise AnthropicError(
|
||||
message="No content in response",
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
text_content = ""
|
||||
tool_calls = []
|
||||
for content in completion_response["content"]:
|
||||
if content["type"] == "text":
|
||||
text_content += content["text"]
|
||||
## TOOL CALLING
|
||||
elif content["type"] == "tool_use":
|
||||
tool_calls.append(
|
||||
{
|
||||
"id": content["id"],
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": content["name"],
|
||||
"arguments": json.dumps(content["input"]),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
_message = litellm.Message(
|
||||
tool_calls=tool_calls,
|
||||
content=text_content or None,
|
||||
)
|
||||
model_response.choices[0].message = _message # type: ignore
|
||||
model_response._hidden_params["original_response"] = completion_response[
|
||||
"content"
|
||||
] # allow user to access raw anthropic tool calling response
|
||||
|
||||
model_response.choices[0].finish_reason = map_finish_reason(
|
||||
completion_response["stop_reason"]
|
||||
)
|
||||
|
||||
print_verbose(f"_is_function_call: {_is_function_call}; stream: {stream}")
|
||||
if _is_function_call and stream:
|
||||
print_verbose("INSIDE ANTHROPIC STREAMING TOOL CALLING CONDITION BLOCK")
|
||||
# return an iterator
|
||||
streaming_model_response = ModelResponse(stream=True)
|
||||
streaming_model_response.choices[0].finish_reason = model_response.choices[
|
||||
0
|
||||
].finish_reason
|
||||
# streaming_model_response.choices = [litellm.utils.StreamingChoices()]
|
||||
streaming_choice = litellm.utils.StreamingChoices()
|
||||
streaming_choice.index = model_response.choices[0].index
|
||||
_tool_calls = []
|
||||
print_verbose(
|
||||
f"type of model_response.choices[0]: {type(model_response.choices[0])}"
|
||||
)
|
||||
print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
|
||||
if isinstance(model_response.choices[0], litellm.Choices):
|
||||
if getattr(
|
||||
model_response.choices[0].message, "tool_calls", None
|
||||
) is not None and isinstance(
|
||||
model_response.choices[0].message.tool_calls, list
|
||||
):
|
||||
for tool_call in model_response.choices[0].message.tool_calls:
|
||||
_tool_call = {**tool_call.dict(), "index": 0}
|
||||
_tool_calls.append(_tool_call)
|
||||
delta_obj = litellm.utils.Delta(
|
||||
content=getattr(model_response.choices[0].message, "content", None),
|
||||
role=model_response.choices[0].message.role,
|
||||
tool_calls=_tool_calls,
|
||||
)
|
||||
streaming_choice.delta = delta_obj
|
||||
streaming_model_response.choices = [streaming_choice]
|
||||
completion_stream = ModelResponseIterator(
|
||||
model_response=streaming_model_response
|
||||
)
|
||||
print_verbose(
|
||||
"Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
|
||||
)
|
||||
return CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
model=model,
|
||||
custom_llm_provider="cached_response",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = completion_response["usage"]["input_tokens"]
|
||||
completion_tokens = completion_response["usage"]["output_tokens"]
|
||||
total_tokens = prompt_tokens + completion_tokens
|
||||
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = model
|
||||
usage = Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=total_tokens,
|
||||
)
|
||||
model_response.usage = usage
|
||||
return model_response
|
||||
|
||||
async def acompletion_stream_function(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
stream,
|
||||
_is_function_call,
|
||||
data=None,
|
||||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
):
|
||||
self.async_handler = AsyncHTTPHandler(
|
||||
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
|
||||
)
|
||||
response = await self.async_handler.post(
|
||||
api_base, headers=headers, data=json.dumps(data)
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise AnthropicError(
|
||||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
completion_stream = response.aiter_lines()
|
||||
|
||||
streamwrapper = CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
model=model,
|
||||
custom_llm_provider="anthropic",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
return streamwrapper
|
||||
|
||||
async def acompletion_function(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
stream,
|
||||
_is_function_call,
|
||||
data=None,
|
||||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
):
|
||||
self.async_handler = AsyncHTTPHandler(
|
||||
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
|
||||
)
|
||||
response = await self.async_handler.post(
|
||||
api_base, headers=headers, data=json.dumps(data)
|
||||
)
|
||||
return self.process_response(
|
||||
model=model,
|
||||
response=response,
|
||||
model_response=model_response,
|
||||
_is_function_call=_is_function_call,
|
||||
stream=stream,
|
||||
logging_obj=logging_obj,
|
||||
api_key=api_key,
|
||||
data=data,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
)
|
||||
|
||||
def completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
|
@ -112,13 +327,13 @@ def completion(
|
|||
api_key,
|
||||
logging_obj,
|
||||
optional_params=None,
|
||||
acompletion=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
):
|
||||
):
|
||||
headers = validate_environment(api_key, headers)
|
||||
_is_function_call = False
|
||||
json_schemas: dict = {}
|
||||
messages = copy.deepcopy(messages)
|
||||
optional_params = copy.deepcopy(optional_params)
|
||||
if model in custom_prompt_dict:
|
||||
|
@ -162,17 +377,15 @@ def completion(
|
|||
## Handle Tool Calling
|
||||
if "tools" in optional_params:
|
||||
_is_function_call = True
|
||||
headers["anthropic-beta"] = "tools-2024-04-04"
|
||||
|
||||
anthropic_tools = []
|
||||
for tool in optional_params["tools"]:
|
||||
json_schemas[tool["function"]["name"]] = tool["function"].get(
|
||||
"parameters", None
|
||||
)
|
||||
tool_calling_system_prompt = construct_tool_use_system_prompt(
|
||||
tools=optional_params["tools"]
|
||||
)
|
||||
optional_params["system"] = (
|
||||
optional_params.get("system", "\n") + tool_calling_system_prompt
|
||||
) # add the anthropic tool calling prompt to the system prompt
|
||||
optional_params.pop("tools")
|
||||
new_tool = tool["function"]
|
||||
new_tool["input_schema"] = new_tool.pop("parameters") # rename key
|
||||
anthropic_tools.append(new_tool)
|
||||
|
||||
optional_params["tools"] = anthropic_tools
|
||||
|
||||
stream = optional_params.pop("stream", None)
|
||||
|
||||
|
@ -193,11 +406,55 @@ def completion(
|
|||
},
|
||||
)
|
||||
print_verbose(f"_is_function_call: {_is_function_call}")
|
||||
if acompletion == True:
|
||||
if (
|
||||
stream and not _is_function_call
|
||||
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
|
||||
print_verbose("makes async anthropic streaming POST request")
|
||||
data["stream"] = stream
|
||||
return self.acompletion_stream_function(
|
||||
model=model,
|
||||
messages=messages,
|
||||
data=data,
|
||||
api_base=api_base,
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
encoding=encoding,
|
||||
api_key=api_key,
|
||||
logging_obj=logging_obj,
|
||||
optional_params=optional_params,
|
||||
stream=stream,
|
||||
_is_function_call=_is_function_call,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
headers=headers,
|
||||
)
|
||||
else:
|
||||
return self.acompletion_function(
|
||||
model=model,
|
||||
messages=messages,
|
||||
data=data,
|
||||
api_base=api_base,
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
encoding=encoding,
|
||||
api_key=api_key,
|
||||
logging_obj=logging_obj,
|
||||
optional_params=optional_params,
|
||||
stream=stream,
|
||||
_is_function_call=_is_function_call,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
headers=headers,
|
||||
)
|
||||
else:
|
||||
## COMPLETION CALL
|
||||
if (
|
||||
stream is not None and stream == True and _is_function_call == False
|
||||
stream and not _is_function_call
|
||||
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
|
||||
print_verbose(f"makes anthropic streaming POST request")
|
||||
print_verbose("makes anthropic streaming POST request")
|
||||
data["stream"] = stream
|
||||
response = requests.post(
|
||||
api_base,
|
||||
|
@ -211,136 +468,39 @@ def completion(
|
|||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
return response.iter_lines()
|
||||
completion_stream = response.iter_lines()
|
||||
streaming_response = CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
model=model,
|
||||
custom_llm_provider="anthropic",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
return streaming_response
|
||||
|
||||
else:
|
||||
response = requests.post(api_base, headers=headers, data=json.dumps(data))
|
||||
response = requests.post(
|
||||
api_base, headers=headers, data=json.dumps(data)
|
||||
)
|
||||
if response.status_code != 200:
|
||||
raise AnthropicError(
|
||||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"raw model_response: {response.text}")
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise AnthropicError(
|
||||
message=response.text, status_code=response.status_code
|
||||
)
|
||||
if "error" in completion_response:
|
||||
raise AnthropicError(
|
||||
message=str(completion_response["error"]),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
elif len(completion_response["content"]) == 0:
|
||||
raise AnthropicError(
|
||||
message="No content in response",
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
text_content = completion_response["content"][0].get("text", None)
|
||||
## TOOL CALLING - OUTPUT PARSE
|
||||
if text_content is not None and contains_tag("invoke", text_content):
|
||||
function_name = extract_between_tags("tool_name", text_content)[0]
|
||||
function_arguments_str = extract_between_tags("invoke", text_content)[
|
||||
0
|
||||
].strip()
|
||||
function_arguments_str = f"<invoke>{function_arguments_str}</invoke>"
|
||||
function_arguments = parse_xml_params(
|
||||
function_arguments_str,
|
||||
json_schema=json_schemas.get(
|
||||
function_name, None
|
||||
), # check if we have a json schema for this function name
|
||||
)
|
||||
_message = litellm.Message(
|
||||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{uuid.uuid4()}",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": function_name,
|
||||
"arguments": json.dumps(function_arguments),
|
||||
},
|
||||
}
|
||||
],
|
||||
content=None,
|
||||
)
|
||||
model_response.choices[0].message = _message # type: ignore
|
||||
model_response._hidden_params["original_response"] = (
|
||||
text_content # allow user to access raw anthropic tool calling response
|
||||
)
|
||||
else:
|
||||
model_response.choices[0].message.content = text_content # type: ignore
|
||||
model_response.choices[0].finish_reason = map_finish_reason(
|
||||
completion_response["stop_reason"]
|
||||
)
|
||||
|
||||
print_verbose(f"_is_function_call: {_is_function_call}; stream: {stream}")
|
||||
if _is_function_call == True and stream is not None and stream == True:
|
||||
print_verbose(f"INSIDE ANTHROPIC STREAMING TOOL CALLING CONDITION BLOCK")
|
||||
# return an iterator
|
||||
streaming_model_response = ModelResponse(stream=True)
|
||||
streaming_model_response.choices[0].finish_reason = model_response.choices[
|
||||
0
|
||||
].finish_reason
|
||||
# streaming_model_response.choices = [litellm.utils.StreamingChoices()]
|
||||
streaming_choice = litellm.utils.StreamingChoices()
|
||||
streaming_choice.index = model_response.choices[0].index
|
||||
_tool_calls = []
|
||||
print_verbose(
|
||||
f"type of model_response.choices[0]: {type(model_response.choices[0])}"
|
||||
)
|
||||
print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
|
||||
if isinstance(model_response.choices[0], litellm.Choices):
|
||||
if getattr(
|
||||
model_response.choices[0].message, "tool_calls", None
|
||||
) is not None and isinstance(
|
||||
model_response.choices[0].message.tool_calls, list
|
||||
):
|
||||
for tool_call in model_response.choices[0].message.tool_calls:
|
||||
_tool_call = {**tool_call.dict(), "index": 0}
|
||||
_tool_calls.append(_tool_call)
|
||||
delta_obj = litellm.utils.Delta(
|
||||
content=getattr(model_response.choices[0].message, "content", None),
|
||||
role=model_response.choices[0].message.role,
|
||||
tool_calls=_tool_calls,
|
||||
)
|
||||
streaming_choice.delta = delta_obj
|
||||
streaming_model_response.choices = [streaming_choice]
|
||||
completion_stream = ModelResponseIterator(
|
||||
model_response=streaming_model_response
|
||||
)
|
||||
print_verbose(
|
||||
f"Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
|
||||
)
|
||||
return CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
return self.process_response(
|
||||
model=model,
|
||||
custom_llm_provider="cached_response",
|
||||
response=response,
|
||||
model_response=model_response,
|
||||
_is_function_call=_is_function_call,
|
||||
stream=stream,
|
||||
logging_obj=logging_obj,
|
||||
api_key=api_key,
|
||||
data=data,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
)
|
||||
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = completion_response["usage"]["input_tokens"]
|
||||
completion_tokens = completion_response["usage"]["output_tokens"]
|
||||
total_tokens = prompt_tokens + completion_tokens
|
||||
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = model
|
||||
usage = Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
)
|
||||
model_response.usage = usage
|
||||
return model_response
|
||||
def embedding(self):
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
pass
|
||||
|
||||
|
||||
class ModelResponseIterator:
|
||||
|
@ -367,8 +527,3 @@ class ModelResponseIterator:
|
|||
raise StopAsyncIteration
|
||||
self.is_done = True
|
||||
return self.model_response
|
||||
|
||||
|
||||
def embedding():
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
pass
|
||||
|
|
|
@ -4,10 +4,12 @@ from enum import Enum
|
|||
import requests
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
|
||||
import litellm
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
import httpx
|
||||
from .base import BaseLLM
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
|
||||
|
||||
class AnthropicConstants(Enum):
|
||||
|
@ -94,10 +96,124 @@ def validate_environment(api_key, user_headers):
|
|||
return headers
|
||||
|
||||
|
||||
def completion(
|
||||
class AnthropicTextCompletion(BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def process_response(
|
||||
self, model_response: ModelResponse, response, encoding, prompt: str, model: str
|
||||
):
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise AnthropicError(
|
||||
message=response.text, status_code=response.status_code
|
||||
)
|
||||
if "error" in completion_response:
|
||||
raise AnthropicError(
|
||||
message=str(completion_response["error"]),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
if len(completion_response["completion"]) > 0:
|
||||
model_response["choices"][0]["message"]["content"] = (
|
||||
completion_response["completion"]
|
||||
)
|
||||
model_response.choices[0].finish_reason = completion_response["stop_reason"]
|
||||
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
) ##[TODO] use the anthropic tokenizer here
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
) ##[TODO] use the anthropic tokenizer here
|
||||
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = model
|
||||
usage = Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
)
|
||||
model_response.usage = usage
|
||||
|
||||
return model_response
|
||||
|
||||
async def async_completion(
|
||||
self,
|
||||
model: str,
|
||||
model_response: ModelResponse,
|
||||
api_base: str,
|
||||
logging_obj,
|
||||
encoding,
|
||||
headers: dict,
|
||||
data: dict,
|
||||
client=None,
|
||||
):
|
||||
if client is None:
|
||||
client = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
|
||||
|
||||
response = await client.post(api_base, headers=headers, data=json.dumps(data))
|
||||
|
||||
if response.status_code != 200:
|
||||
raise AnthropicError(
|
||||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=data["prompt"],
|
||||
api_key=headers.get("x-api-key"),
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
|
||||
response = self.process_response(
|
||||
model_response=model_response,
|
||||
response=response,
|
||||
encoding=encoding,
|
||||
prompt=data["prompt"],
|
||||
model=model,
|
||||
)
|
||||
return response
|
||||
|
||||
async def async_streaming(
|
||||
self,
|
||||
model: str,
|
||||
api_base: str,
|
||||
logging_obj,
|
||||
headers: dict,
|
||||
data: Optional[dict],
|
||||
client=None,
|
||||
):
|
||||
if client is None:
|
||||
client = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
|
||||
|
||||
response = await client.post(api_base, headers=headers, data=json.dumps(data))
|
||||
|
||||
if response.status_code != 200:
|
||||
raise AnthropicError(
|
||||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
completion_stream = response.aiter_lines()
|
||||
|
||||
streamwrapper = CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
model=model,
|
||||
custom_llm_provider="anthropic_text",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
return streamwrapper
|
||||
|
||||
def completion(
|
||||
self,
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
acompletion: str,
|
||||
custom_prompt_dict: dict,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
|
@ -108,7 +224,8 @@ def completion(
|
|||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
):
|
||||
client=None,
|
||||
):
|
||||
headers = validate_environment(api_key, headers)
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
|
@ -151,21 +268,53 @@ def completion(
|
|||
|
||||
## COMPLETION CALL
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = requests.post(
|
||||
if acompletion == True:
|
||||
return self.async_streaming(
|
||||
model=model,
|
||||
api_base=api_base,
|
||||
logging_obj=logging_obj,
|
||||
headers=headers,
|
||||
data=data,
|
||||
client=None,
|
||||
)
|
||||
|
||||
if client is None:
|
||||
client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
|
||||
|
||||
response = client.post(
|
||||
api_base,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
stream=optional_params["stream"],
|
||||
# stream=optional_params["stream"],
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise AnthropicError(
|
||||
status_code=response.status_code, message=response.text
|
||||
)
|
||||
|
||||
return response.iter_lines()
|
||||
completion_stream = response.iter_lines()
|
||||
stream_response = CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
model=model,
|
||||
custom_llm_provider="anthropic_text",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
return stream_response
|
||||
elif acompletion == True:
|
||||
return self.async_completion(
|
||||
model=model,
|
||||
model_response=model_response,
|
||||
api_base=api_base,
|
||||
logging_obj=logging_obj,
|
||||
encoding=encoding,
|
||||
headers=headers,
|
||||
data=data,
|
||||
client=client,
|
||||
)
|
||||
else:
|
||||
response = requests.post(api_base, headers=headers, data=json.dumps(data))
|
||||
if client is None:
|
||||
client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
|
||||
response = client.post(api_base, headers=headers, data=json.dumps(data))
|
||||
if response.status_code != 200:
|
||||
raise AnthropicError(
|
||||
status_code=response.status_code, message=response.text
|
||||
|
@ -179,44 +328,16 @@ def completion(
|
|||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"raw model_response: {response.text}")
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise AnthropicError(
|
||||
message=response.text, status_code=response.status_code
|
||||
)
|
||||
if "error" in completion_response:
|
||||
raise AnthropicError(
|
||||
message=str(completion_response["error"]),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
if len(completion_response["completion"]) > 0:
|
||||
model_response["choices"][0]["message"]["content"] = (
|
||||
completion_response["completion"]
|
||||
)
|
||||
model_response.choices[0].finish_reason = completion_response["stop_reason"]
|
||||
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
) ##[TODO] use the anthropic tokenizer here
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
) ##[TODO] use the anthropic tokenizer here
|
||||
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = model
|
||||
usage = Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
response = self.process_response(
|
||||
model_response=model_response,
|
||||
response=response,
|
||||
encoding=encoding,
|
||||
prompt=data["prompt"],
|
||||
model=model,
|
||||
)
|
||||
model_response.usage = usage
|
||||
return model_response
|
||||
return response
|
||||
|
||||
|
||||
def embedding():
|
||||
def embedding(self):
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
pass
|
||||
|
|
|
@ -746,7 +746,7 @@ def completion(
|
|||
]
|
||||
# Format rest of message according to anthropic guidelines
|
||||
messages = prompt_factory(
|
||||
model=model, messages=messages, custom_llm_provider="anthropic"
|
||||
model=model, messages=messages, custom_llm_provider="anthropic_xml"
|
||||
)
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAnthropicClaude3Config.get_config()
|
||||
|
@ -1108,6 +1108,7 @@ def completion(
|
|||
|
||||
raise BedrockError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
|
||||
class ModelResponseIterator:
|
||||
def __init__(self, model_response):
|
||||
self.model_response = model_response
|
||||
|
@ -1133,6 +1134,7 @@ class ModelResponseIterator:
|
|||
self.is_done = True
|
||||
return self.model_response
|
||||
|
||||
|
||||
def _embedding_func_single(
|
||||
model: str,
|
||||
input: str,
|
||||
|
|
97
litellm/llms/custom_httpx/http_handler.py
Normal file
@@ -0,0 +1,97 @@
import httpx, asyncio
from typing import Optional, Union, Mapping, Any

# https://www.python-httpx.org/advanced/timeouts
_DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0)


class AsyncHTTPHandler:
    def __init__(
        self, timeout: httpx.Timeout = _DEFAULT_TIMEOUT, concurrent_limit=1000
    ):
        # Create a client with a connection pool
        self.client = httpx.AsyncClient(
            timeout=timeout,
            limits=httpx.Limits(
                max_connections=concurrent_limit,
                max_keepalive_connections=concurrent_limit,
            ),
        )

    async def close(self):
        # Close the client when you're done with it
        await self.client.aclose()

    async def __aenter__(self):
        return self.client

    async def __aexit__(self):
        # close the client when exiting
        await self.client.aclose()

    async def get(
        self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
    ):
        response = await self.client.get(url, params=params, headers=headers)
        return response

    async def post(
        self,
        url: str,
        data: Optional[Union[dict, str]] = None,  # type: ignore
        params: Optional[dict] = None,
        headers: Optional[dict] = None,
    ):
        response = await self.client.post(
            url,
            data=data,  # type: ignore
            params=params,
            headers=headers,
        )
        return response

    def __del__(self) -> None:
        try:
            asyncio.get_running_loop().create_task(self.close())
        except Exception:
            pass


class HTTPHandler:
    def __init__(
        self, timeout: httpx.Timeout = _DEFAULT_TIMEOUT, concurrent_limit=1000
    ):
        # Create a client with a connection pool
        self.client = httpx.Client(
            timeout=timeout,
            limits=httpx.Limits(
                max_connections=concurrent_limit,
                max_keepalive_connections=concurrent_limit,
            ),
        )

    def close(self):
        # Close the client when you're done with it
        self.client.close()

    def get(
        self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
    ):
        response = self.client.get(url, params=params, headers=headers)
        return response

    def post(
        self,
        url: str,
        data: Optional[dict] = None,
        params: Optional[dict] = None,
        headers: Optional[dict] = None,
    ):
        response = self.client.post(url, data=data, params=params, headers=headers)
        return response

    def __del__(self) -> None:
        try:
            self.close()
        except Exception:
            pass
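A short usage sketch for the two new handlers; the URLs are placeholders, not real litellm endpoints:

    import asyncio
    from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler

    def sync_demo():
        client = HTTPHandler()
        resp = client.get("https://httpbin.org/get")  # placeholder URL
        return resp.status_code

    async def async_demo():
        client = AsyncHTTPHandler()
        resp = await client.post("https://httpbin.org/post", data='{"ping": "pong"}')
        await client.close()
        return resp.status_code

    print(sync_demo(), asyncio.run(async_demo()))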
|
|
@ -6,7 +6,8 @@ from typing import Callable, Optional
|
|||
from litellm.utils import ModelResponse, get_secret, Choices, Message, Usage
|
||||
import litellm
|
||||
import sys, httpx
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt, get_system_prompt
|
||||
from packaging.version import Version
|
||||
|
||||
|
||||
class GeminiError(Exception):
|
||||
|
@ -103,6 +104,13 @@ class TextStreamer:
|
|||
break
|
||||
|
||||
|
||||
def supports_system_instruction():
|
||||
import google.generativeai as genai
|
||||
|
||||
gemini_pkg_version = Version(genai.__version__)
|
||||
return gemini_pkg_version >= Version("0.5.0")
|
||||
|
||||
|
||||
def completion(
|
||||
model: str,
|
||||
messages: list,
|
||||
|
@ -124,7 +132,7 @@ def completion(
|
|||
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
|
||||
)
|
||||
genai.configure(api_key=api_key)
|
||||
|
||||
system_prompt = ""
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
|
@ -135,6 +143,7 @@ def completion(
|
|||
messages=messages,
|
||||
)
|
||||
else:
|
||||
system_prompt, messages = get_system_prompt(messages=messages)
|
||||
prompt = prompt_factory(
|
||||
model=model, messages=messages, custom_llm_provider="gemini"
|
||||
)
|
||||
|
@ -162,11 +171,20 @@ def completion(
|
|||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
additional_args={"complete_input_dict": {"inference_params": inference_params}},
|
||||
additional_args={
|
||||
"complete_input_dict": {
|
||||
"inference_params": inference_params,
|
||||
"system_prompt": system_prompt,
|
||||
}
|
||||
},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
try:
|
||||
_model = genai.GenerativeModel(f"models/{model}")
|
||||
_params = {"model_name": "models/{}".format(model)}
|
||||
_system_instruction = supports_system_instruction()
|
||||
if _system_instruction and len(system_prompt) > 0:
|
||||
_params["system_instruction"] = system_prompt
|
||||
_model = genai.GenerativeModel(**_params)
|
||||
if stream == True:
|
||||
if acompletion == True:
|
||||
|
||||
|
@ -213,11 +231,12 @@ def completion(
|
|||
encoding=encoding,
|
||||
)
|
||||
else:
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
safety_settings=safety_settings,
|
||||
)
|
||||
params = {
|
||||
"contents": prompt,
|
||||
"generation_config": genai.types.GenerationConfig(**inference_params),
|
||||
"safety_settings": safety_settings,
|
||||
}
|
||||
response = _model.generate_content(**params)
|
||||
except Exception as e:
|
||||
raise GeminiError(
|
||||
message=str(e),
|
||||
|
|
|
@ -229,7 +229,7 @@ def get_ollama_response(
|
|||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + model
|
||||
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
|
||||
completion_tokens = response_json["eval_count"]
|
||||
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -331,7 +331,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + data["model"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"]))) # type: ignore
|
||||
completion_tokens = response_json["eval_count"]
|
||||
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from enum import Enum
|
||||
import requests, traceback
|
||||
import json, re, xml.etree.ElementTree as ET
|
||||
from jinja2 import Template, exceptions, Environment, meta
|
||||
from jinja2 import Template, exceptions, meta, BaseLoader
|
||||
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
||||
from typing import Optional, Any
|
||||
import imghdr, base64
|
||||
from typing import List
|
||||
import litellm
|
||||
|
||||
|
@ -219,6 +219,15 @@ def phind_codellama_pt(messages):
|
|||
|
||||
|
||||
def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] = None):
|
||||
# Define Jinja2 environment
|
||||
env = ImmutableSandboxedEnvironment()
|
||||
|
||||
def raise_exception(message):
|
||||
raise Exception(f"Error message - {message}")
|
||||
|
||||
# Create a template object from the template text
|
||||
env.globals["raise_exception"] = raise_exception
|
||||
|
||||
## get the tokenizer config from huggingface
|
||||
bos_token = ""
|
||||
eos_token = ""
|
||||
|
@ -249,12 +258,6 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] =
|
|||
eos_token = tokenizer_config["eos_token"]
|
||||
chat_template = tokenizer_config["chat_template"]
|
||||
|
||||
def raise_exception(message):
|
||||
raise Exception(f"Error message - {message}")
|
||||
|
||||
# Create a template object from the template text
|
||||
env = Environment()
|
||||
env.globals["raise_exception"] = raise_exception
|
||||
try:
|
||||
template = env.from_string(chat_template)
|
||||
except Exception as e:
|
||||
|
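For context on the switch from `Environment` to `ImmutableSandboxedEnvironment` above: the sandboxed environment refuses unsafe attribute access and in-place mutation inside an untrusted `chat_template` string. A minimal sketch with an illustrative template:

    from jinja2.sandbox import ImmutableSandboxedEnvironment

    env = ImmutableSandboxedEnvironment()
    template = env.from_string(
        "{{ bos_token }}{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
    )
    print(template.render(bos_token="<s>", messages=[{"role": "user", "content": "hi"}]))
    # A template that tries something like {{ messages.append(...) }} raises SecurityError here.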
@ -556,7 +559,9 @@ def convert_to_anthropic_image_obj(openai_image_url: str):
|
|||
)
|
||||
|
||||
|
||||
def convert_to_anthropic_tool_result(message: dict) -> str:
|
||||
# The following XML functions will be deprecated once JSON schema support is available on Bedrock and Vertex
|
||||
# ------------------------------------------------------------------------------
|
||||
def convert_to_anthropic_tool_result_xml(message: dict) -> str:
|
||||
"""
|
||||
OpenAI message with a tool result looks like:
|
||||
{
|
||||
|
@ -606,7 +611,7 @@ def convert_to_anthropic_tool_result(message: dict) -> str:
|
|||
return anthropic_tool_result
|
||||
|
||||
|
||||
def convert_to_anthropic_tool_invoke(tool_calls: list) -> str:
|
||||
def convert_to_anthropic_tool_invoke_xml(tool_calls: list) -> str:
|
||||
invokes = ""
|
||||
for tool in tool_calls:
|
||||
if tool["type"] != "function":
|
||||
|
@ -631,7 +636,7 @@ def convert_to_anthropic_tool_invoke(tool_calls: list) -> str:
|
|||
return anthropic_tool_invoke
|
||||
|
||||
|
||||
def anthropic_messages_pt(messages: list):
|
||||
def anthropic_messages_pt_xml(messages: list):
|
||||
"""
|
||||
format messages for anthropic
|
||||
1. Anthropic supports roles like "user" and "assistant", (here litellm translates system-> assistant)
|
||||
|
@ -690,7 +695,7 @@ def anthropic_messages_pt(messages: list):
|
|||
if messages[msg_i].get(
|
||||
"tool_calls", []
|
||||
): # support assistant tool invoke convertion
|
||||
assistant_text += convert_to_anthropic_tool_invoke(
|
||||
assistant_text += convert_to_anthropic_tool_invoke( # type: ignore
|
||||
messages[msg_i]["tool_calls"]
|
||||
)
|
||||
|
||||
|
@ -700,7 +705,186 @@ def anthropic_messages_pt(messages: list):
|
|||
if assistant_content:
|
||||
new_messages.append({"role": "assistant", "content": assistant_content})
|
||||
|
||||
if new_messages[0]["role"] != "user":
|
||||
if not new_messages or new_messages[0]["role"] != "user":
|
||||
if litellm.modify_params:
|
||||
new_messages.insert(
|
||||
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
|
||||
)
|
||||
else:
|
||||
raise Exception(
|
||||
"Invalid first message. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, "
|
||||
)
|
||||
|
||||
if new_messages[-1]["role"] == "assistant":
|
||||
for content in new_messages[-1]["content"]:
|
||||
if isinstance(content, dict) and content["type"] == "text":
|
||||
content["text"] = content[
|
||||
"text"
|
||||
].rstrip() # no trailing whitespace for final assistant message
|
||||
|
||||
return new_messages
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def convert_to_anthropic_tool_result(message: dict) -> dict:
|
||||
"""
|
||||
OpenAI message with a tool result looks like:
|
||||
{
|
||||
"tool_call_id": "tool_1",
|
||||
"role": "tool",
|
||||
"name": "get_current_weather",
|
||||
"content": "function result goes here",
|
||||
},
|
||||
"""
|
||||
|
||||
"""
|
||||
Anthropic tool_results look like:
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "tool_result",
|
||||
"tool_use_id": "toolu_01A09q90qw90lq917835lq9",
|
||||
"content": "ConnectionError: the weather service API is not available (HTTP 500)",
|
||||
# "is_error": true
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
tool_call_id = message.get("tool_call_id")
|
||||
content = message.get("content")
|
||||
|
||||
# We can't determine from openai message format whether it's a successful or
|
||||
# error call result so default to the successful result template
|
||||
anthropic_tool_result = {
|
||||
"type": "tool_result",
|
||||
"tool_use_id": tool_call_id,
|
||||
"content": content,
|
||||
}
|
||||
|
||||
return anthropic_tool_result
|
||||
|
||||
|
||||
def convert_to_anthropic_tool_invoke(tool_calls: list) -> list:
|
||||
"""
|
||||
OpenAI tool invokes:
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": null,
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "call_abc123",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"arguments": "{\n\"location\": \"Boston, MA\"\n}"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"""
|
||||
|
||||
"""
|
||||
Anthropic tool invokes:
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "<thinking>To answer this question, I will: 1. Use the get_weather tool to get the current weather in San Francisco. 2. Use the get_time tool to get the current time in the America/Los_Angeles timezone, which covers San Francisco, CA.</thinking>"
|
||||
},
|
||||
{
|
||||
"type": "tool_use",
|
||||
"id": "toolu_01A09q90qw90lq917835lq9",
|
||||
"name": "get_weather",
|
||||
"input": {"location": "San Francisco, CA"}
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
anthropic_tool_invoke = [
|
||||
{
|
||||
"type": "tool_use",
|
||||
"id": tool["id"],
|
||||
"name": tool["function"]["name"],
|
||||
"input": json.loads(tool["function"]["arguments"]),
|
||||
}
|
||||
for tool in tool_calls
|
||||
if tool["type"] == "function"
|
||||
]
|
||||
|
||||
return anthropic_tool_invoke
|
||||
|
||||
|
||||
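A worked example of the conversion performed by `convert_to_anthropic_tool_invoke` above, using the sample payload from its docstring:

    openai_tool_calls = [
        {
            "id": "call_abc123",
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "arguments": "{\n\"location\": \"Boston, MA\"\n}",
            },
        }
    ]

    # convert_to_anthropic_tool_invoke(openai_tool_calls) returns:
    # [
    #     {
    #         "type": "tool_use",
    #         "id": "call_abc123",
    #         "name": "get_current_weather",
    #         "input": {"location": "Boston, MA"},
    #     }
    # ]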
def anthropic_messages_pt(messages: list):
|
||||
"""
|
||||
format messages for anthropic
|
||||
1. Anthropic supports roles like "user" and "assistant", (here litellm translates system-> assistant)
|
||||
2. The first message always needs to be of role "user"
|
||||
3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm)
|
||||
4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
|
||||
5. System messages are a separate param to the Messages API
|
||||
6. Ensure we only accept role, content. (message.name is not supported)
|
||||
"""
|
||||
# add role=tool support to allow function call result/error submission
|
||||
user_message_types = {"user", "tool"}
|
||||
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
|
||||
new_messages = []
|
||||
msg_i = 0
|
||||
while msg_i < len(messages):
|
||||
user_content = []
|
||||
## MERGE CONSECUTIVE USER CONTENT ##
|
||||
while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
|
||||
if isinstance(messages[msg_i]["content"], list):
|
||||
for m in messages[msg_i]["content"]:
|
||||
if m.get("type", "") == "image_url":
|
||||
user_content.append(
|
||||
{
|
||||
"type": "image",
|
||||
"source": convert_to_anthropic_image_obj(
|
||||
m["image_url"]["url"]
|
||||
),
|
||||
}
|
||||
)
|
||||
elif m.get("type", "") == "text":
|
||||
user_content.append({"type": "text", "text": m["text"]})
|
||||
elif messages[msg_i]["role"] == "tool":
|
||||
# OpenAI's tool message content will always be a string
|
||||
user_content.append(convert_to_anthropic_tool_result(messages[msg_i]))
|
||||
else:
|
||||
user_content.append(
|
||||
{"type": "text", "text": messages[msg_i]["content"]}
|
||||
)
|
||||
|
||||
msg_i += 1
|
||||
|
||||
if user_content:
|
||||
new_messages.append({"role": "user", "content": user_content})
|
||||
|
||||
assistant_content = []
|
||||
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
|
||||
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
|
||||
assistant_text = (
|
||||
messages[msg_i].get("content") or ""
|
||||
) # either string or none
|
||||
if assistant_text:
|
||||
assistant_content.append({"type": "text", "text": assistant_text})
|
||||
|
||||
if messages[msg_i].get(
|
||||
"tool_calls", []
|
||||
): # support assistant tool invoke convertion
|
||||
assistant_content.extend(
|
||||
convert_to_anthropic_tool_invoke(messages[msg_i]["tool_calls"])
|
||||
)
|
||||
|
||||
msg_i += 1
|
||||
|
||||
if assistant_content:
|
||||
new_messages.append({"role": "assistant", "content": assistant_content})
|
||||
|
||||
if not new_messages or new_messages[0]["role"] != "user":
|
||||
if litellm.modify_params:
|
||||
new_messages.insert(
|
||||
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
|
||||
|
@@ -778,7 +962,20 @@ def parse_xml_params(xml_content, json_schema: Optional[dict] = None):
    return params


###
### GEMINI HELPER FUNCTIONS ###


def get_system_prompt(messages):
    system_prompt_indices = []
    system_prompt = ""
    for idx, message in enumerate(messages):
        if message["role"] == "system":
            system_prompt += message["content"]
            system_prompt_indices.append(idx)
    if len(system_prompt_indices) > 0:
        for idx in reversed(system_prompt_indices):
            messages.pop(idx)
    return system_prompt, messages

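For illustration, the Gemini helper above pulls system messages out of the list and returns them separately:

    msgs = [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Hi"},
    ]
    system_prompt, remaining = get_system_prompt(messages=msgs)
    # system_prompt == "You are terse."
    # remaining    == [{"role": "user", "content": "Hi"}]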
def convert_openai_message_to_cohere_tool_result(message):
|
||||
|
@ -1081,13 +1278,19 @@ def prompt_factory(
|
|||
if model == "claude-instant-1" or model == "claude-2":
|
||||
return anthropic_pt(messages=messages)
|
||||
return anthropic_messages_pt(messages=messages)
|
||||
elif custom_llm_provider == "anthropic_xml":
|
||||
return anthropic_messages_pt_xml(messages=messages)
|
||||
elif custom_llm_provider == "together_ai":
|
||||
prompt_format, chat_template = get_model_info(token=api_key, model=model)
|
||||
return format_prompt_togetherai(
|
||||
messages=messages, prompt_format=prompt_format, chat_template=chat_template
|
||||
)
|
||||
elif custom_llm_provider == "gemini":
|
||||
if model == "gemini-pro-vision":
|
||||
if (
|
||||
model == "gemini-pro-vision"
|
||||
or litellm.supports_vision(model=model)
|
||||
or litellm.supports_vision(model=custom_llm_provider + "/" + model)
|
||||
):
|
||||
return _gemini_vision_convert_messages(messages=messages)
|
||||
else:
|
||||
return gemini_text_image_pt(messages=messages)
|
||||
|
|
|
@ -332,9 +332,12 @@ def completion(
|
|||
model_response["choices"][0]["message"]["content"] = result
|
||||
|
||||
# Calculate usage
|
||||
prompt_tokens = len(encoding.encode(prompt))
|
||||
prompt_tokens = len(encoding.encode(prompt, disallowed_special=()))
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
encoding.encode(
|
||||
model_response["choices"][0]["message"].get("content", ""),
|
||||
disallowed_special=(),
|
||||
)
|
||||
)
|
||||
model_response["model"] = "replicate/" + model
|
||||
usage = Usage(
|
||||
|
|
|
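Context for the `disallowed_special=()` change above: with default settings a tiktoken-style encoder raises if user text happens to contain a special-token string, so the diff disables that check when counting Replicate tokens. A sketch, assuming the `tiktoken` package is the encoding in use:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "user literally typed <|endoftext|> in their prompt"
    # enc.encode(text) would raise ValueError because of the special token;
    # passing disallowed_special=() treats it as ordinary text instead.
    tokens = enc.encode(text, disallowed_special=())
    print(len(tokens))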
@ -3,10 +3,10 @@ import json
|
|||
from enum import Enum
|
||||
import requests
|
||||
import time
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Callable, Optional, Union, List
|
||||
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
|
||||
import litellm, uuid
|
||||
import httpx
|
||||
import httpx, inspect
|
||||
|
||||
|
||||
class VertexAIError(Exception):
|
||||
|
@ -25,6 +25,7 @@ class VertexAIError(Exception):
|
|||
class VertexAIConfig:
|
||||
"""
|
||||
Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
|
||||
Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
|
||||
|
||||
The class `VertexAIConfig` provides configuration for the VertexAI's API interface. Below are the parameters:
|
||||
|
||||
|
@ -36,6 +37,12 @@ class VertexAIConfig:
|
|||
|
||||
- `top_k` (integer): The value of `top_k` determines how many of the most probable tokens are considered in the selection. For example, a `top_k` of 1 means the selected token is the most probable among all tokens. The default value is 40.
|
||||
|
||||
- `response_mime_type` (str): The MIME type of the response. The default value is 'text/plain'.
|
||||
|
||||
- `candidate_count` (int): Number of generated responses to return.
|
||||
|
||||
- `stop_sequences` (List[str]): The set of character sequences (up to 5) that will stop output generation. If specified, the API will stop at the first appearance of a stop sequence. The stop sequence will not be included as part of the response.
|
||||
|
||||
Note: Please make sure to modify the default parameters as required for your use case.
|
||||
"""
|
||||
|
||||
|
@ -43,6 +50,9 @@ class VertexAIConfig:
|
|||
max_output_tokens: Optional[int] = None
|
||||
top_p: Optional[float] = None
|
||||
top_k: Optional[int] = None
|
||||
response_mime_type: Optional[str] = None
|
||||
candidate_count: Optional[int] = None
|
||||
stop_sequences: Optional[list] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -50,6 +60,9 @@ class VertexAIConfig:
|
|||
max_output_tokens: Optional[int] = None,
|
||||
top_p: Optional[float] = None,
|
||||
top_k: Optional[int] = None,
|
||||
response_mime_type: Optional[str] = None,
|
||||
candidate_count: Optional[int] = None,
|
||||
stop_sequences: Optional[list] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
for key, value in locals_.items():
|
||||
|
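A hedged sketch of the newly-surfaced Vertex AI generation params. The model name is illustrative, and this assumes the extra kwargs are forwarded to `VertexAIConfig` the same way the existing `temperature` / `top_p` params are:

    import litellm

    response = litellm.completion(
        model="vertex_ai/gemini-1.0-pro",
        messages=[{"role": "user", "content": "Reply with a JSON object containing a 'joke' key."}],
        temperature=0.2,
        candidate_count=1,                       # new passthrough param
        stop_sequences=["\n\n"],                 # new passthrough param
        response_mime_type="application/json",   # new passthrough param
    )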
@ -257,6 +270,7 @@ def completion(
|
|||
logging_obj,
|
||||
vertex_project=None,
|
||||
vertex_location=None,
|
||||
vertex_credentials=None,
|
||||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
|
@ -295,10 +309,47 @@ def completion(
|
|||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
|
||||
import google.auth # type: ignore
|
||||
|
||||
class ExtendedGenerationConfig(GenerationConfig):
|
||||
"""Extended parameters for the generation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
temperature: Optional[float] = None,
|
||||
top_p: Optional[float] = None,
|
||||
top_k: Optional[int] = None,
|
||||
candidate_count: Optional[int] = None,
|
||||
max_output_tokens: Optional[int] = None,
|
||||
stop_sequences: Optional[List[str]] = None,
|
||||
response_mime_type: Optional[str] = None,
|
||||
):
|
||||
args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
|
||||
|
||||
if "response_mime_type" in args_spec.args:
|
||||
self._raw_generation_config = gapic_content_types.GenerationConfig(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
candidate_count=candidate_count,
|
||||
max_output_tokens=max_output_tokens,
|
||||
stop_sequences=stop_sequences,
|
||||
response_mime_type=response_mime_type,
|
||||
)
|
||||
else:
|
||||
self._raw_generation_config = gapic_content_types.GenerationConfig(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
candidate_count=candidate_count,
|
||||
max_output_tokens=max_output_tokens,
|
||||
stop_sequences=stop_sequences,
|
||||
)
|
||||
|
||||
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
|
||||
print_verbose(
|
||||
f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}"
|
||||
)
|
||||
|
||||
creds, _ = google.auth.default(quota_project_id=vertex_project)
|
||||
print_verbose(
|
||||
f"VERTEX AI: creds={creds}; google application credentials: {os.getenv('GOOGLE_APPLICATION_CREDENTIALS')}"
|
||||
|
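The `ExtendedGenerationConfig` wrapper above uses argument introspection so `response_mime_type` is only sent on SDK versions that accept it. The same feature-detection pattern in isolation, as a generic sketch rather than the exact litellm call:

    import inspect

    def filter_supported_kwargs(cls, **kwargs):
        # keep only the kwargs that the installed class constructor actually accepts
        accepted = inspect.getfullargspec(cls.__init__).args
        return {k: v for k, v in kwargs.items() if k in accepted}

    class OldConfig:
        def __init__(self, temperature=None, top_p=None):
            self.temperature, self.top_p = temperature, top_p

    cfg = OldConfig(**filter_supported_kwargs(OldConfig, temperature=0.1, response_mime_type="text/plain"))
    # response_mime_type is silently dropped because OldConfig predates it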
@ -417,7 +468,7 @@ def completion(
|
|||
return async_completion(**data)
|
||||
|
||||
if mode == "vision":
|
||||
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
|
||||
print_verbose("\nMaking VertexAI Gemini Pro / Pro Vision Call")
|
||||
print_verbose(f"\nProcessing input messages = {messages}")
|
||||
tools = optional_params.pop("tools", None)
|
||||
prompt, images = _gemini_vision_convert_messages(messages=messages)
|
||||
|
@ -436,7 +487,7 @@ def completion(
|
|||
|
||||
model_response = llm_model.generate_content(
|
||||
contents=content,
|
||||
generation_config=GenerationConfig(**optional_params),
|
||||
generation_config=ExtendedGenerationConfig(**optional_params),
|
||||
safety_settings=safety_settings,
|
||||
stream=True,
|
||||
tools=tools,
|
||||
|
@ -458,7 +509,7 @@ def completion(
|
|||
## LLM Call
|
||||
response = llm_model.generate_content(
|
||||
contents=content,
|
||||
generation_config=GenerationConfig(**optional_params),
|
||||
generation_config=ExtendedGenerationConfig(**optional_params),
|
||||
safety_settings=safety_settings,
|
||||
tools=tools,
|
||||
)
|
||||
|
@ -698,6 +749,43 @@ async def async_completion(
|
|||
"""
|
||||
try:
|
||||
from vertexai.preview.generative_models import GenerationConfig
|
||||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
|
||||
|
||||
class ExtendedGenerationConfig(GenerationConfig):
|
||||
"""Extended parameters for the generation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
temperature: Optional[float] = None,
|
||||
top_p: Optional[float] = None,
|
||||
top_k: Optional[int] = None,
|
||||
candidate_count: Optional[int] = None,
|
||||
max_output_tokens: Optional[int] = None,
|
||||
stop_sequences: Optional[List[str]] = None,
|
||||
response_mime_type: Optional[str] = None,
|
||||
):
|
||||
args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
|
||||
|
||||
if "response_mime_type" in args_spec.args:
|
||||
self._raw_generation_config = gapic_content_types.GenerationConfig(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
candidate_count=candidate_count,
|
||||
max_output_tokens=max_output_tokens,
|
||||
stop_sequences=stop_sequences,
|
||||
response_mime_type=response_mime_type,
|
||||
)
|
||||
else:
|
||||
self._raw_generation_config = gapic_content_types.GenerationConfig(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
candidate_count=candidate_count,
|
||||
max_output_tokens=max_output_tokens,
|
||||
stop_sequences=stop_sequences,
|
||||
)
|
||||
|
||||
if mode == "vision":
|
||||
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
|
||||
|
@ -721,7 +809,7 @@ async def async_completion(
|
|||
## LLM Call
|
||||
response = await llm_model._generate_content_async(
|
||||
contents=content,
|
||||
generation_config=GenerationConfig(**optional_params),
|
||||
generation_config=ExtendedGenerationConfig(**optional_params),
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
|
@ -906,6 +994,43 @@ async def async_streaming(
|
|||
Add support for async streaming calls for gemini-pro
|
||||
"""
|
||||
from vertexai.preview.generative_models import GenerationConfig
|
||||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
|
||||
|
||||
class ExtendedGenerationConfig(GenerationConfig):
|
||||
"""Extended parameters for the generation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
temperature: Optional[float] = None,
|
||||
top_p: Optional[float] = None,
|
||||
top_k: Optional[int] = None,
|
||||
candidate_count: Optional[int] = None,
|
||||
max_output_tokens: Optional[int] = None,
|
||||
stop_sequences: Optional[List[str]] = None,
|
||||
response_mime_type: Optional[str] = None,
|
||||
):
|
||||
args_spec = inspect.getfullargspec(gapic_content_types.GenerationConfig)
|
||||
|
||||
if "response_mime_type" in args_spec.args:
|
||||
self._raw_generation_config = gapic_content_types.GenerationConfig(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
candidate_count=candidate_count,
|
||||
max_output_tokens=max_output_tokens,
|
||||
stop_sequences=stop_sequences,
|
||||
response_mime_type=response_mime_type,
|
||||
)
|
||||
else:
|
||||
self._raw_generation_config = gapic_content_types.GenerationConfig(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
top_k=top_k,
|
||||
candidate_count=candidate_count,
|
||||
max_output_tokens=max_output_tokens,
|
||||
stop_sequences=stop_sequences,
|
||||
)
|
||||
|
||||
if mode == "vision":
|
||||
stream = optional_params.pop("stream")
|
||||
|
@ -927,7 +1052,7 @@ async def async_streaming(
|
|||
|
||||
response = await llm_model._generate_content_streaming_async(
|
||||
contents=content,
|
||||
generation_config=GenerationConfig(**optional_params),
|
||||
generation_config=ExtendedGenerationConfig(**optional_params),
|
||||
tools=tools,
|
||||
)
|
||||
optional_params["stream"] = True
|
||||
|
|
469
litellm/llms/vertex_ai_anthropic.py
Normal file
@@ -0,0 +1,469 @@
# What is this?
## Handler file for calling claude-3 on vertex ai
import os, types
import json
from enum import Enum
import requests, copy
import time, uuid
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .prompt_templates.factory import (
    contains_tag,
    prompt_factory,
    custom_prompt,
    construct_tool_use_system_prompt,
    extract_between_tags,
    parse_xml_params,
)
import httpx


class VertexAIError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        self.request = httpx.Request(
            method="POST", url=" https://cloud.google.com/vertex-ai/"
        )
        self.response = httpx.Response(status_code=status_code, request=self.request)
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class VertexAIAnthropicConfig:
    """
    Reference: https://docs.anthropic.com/claude/reference/messages_post

    Note that the API for Claude on Vertex differs from the Anthropic API documentation in the following ways:

    - `model` is not a valid parameter. The model is instead specified in the Google Cloud endpoint URL.
    - `anthropic_version` is a required parameter and must be set to "vertex-2023-10-16".

    The class `VertexAIAnthropicConfig` provides configuration for the VertexAI's Anthropic API interface. Below are the parameters:

    - `max_tokens` Required (integer) max tokens,
    - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
    - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
    - `temperature` Optional (float) The amount of randomness injected into the response
    - `top_p` Optional (float) Use nucleus sampling.
    - `top_k` Optional (int) Only sample from the top K options for each subsequent token
    - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating

    Note: Please make sure to modify the default parameters as required for your use case.
    """

    max_tokens: Optional[int] = (
        4096  # anthropic max - setting this doesn't impact response, but is required by anthropic.
    )
    system: Optional[str] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    top_k: Optional[int] = None
    stop_sequences: Optional[List[str]] = None

    def __init__(
        self,
        max_tokens: Optional[int] = None,
        anthropic_version: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key == "max_tokens" and value is None:
                value = self.max_tokens
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }

    def get_supported_openai_params(self):
        return [
            "max_tokens",
            "tools",
            "tool_choice",
            "stream",
            "stop",
            "temperature",
            "top_p",
        ]

    def map_openai_params(self, non_default_params: dict, optional_params: dict):
        for param, value in non_default_params.items():
            if param == "max_tokens":
                optional_params["max_tokens"] = value
            if param == "tools":
                optional_params["tools"] = value
            if param == "stream":
                optional_params["stream"] = value
            if param == "stop":
                optional_params["stop_sequences"] = value
            if param == "temperature":
                optional_params["temperature"] = value
            if param == "top_p":
                optional_params["top_p"] = value
        return optional_params


"""
- Run client init
- Support async completion, streaming
"""


# makes headers for API call
def refresh_auth(
    credentials,
) -> str:  # used when user passes in credentials as json string
    from google.auth.transport.requests import Request  # type: ignore[import-untyped]

    if credentials.token is None:
        credentials.refresh(Request())

    if not credentials.token:
        raise RuntimeError("Could not resolve API token from the credentials")

    return credentials.token


def completion(
    model: str,
    messages: list,
    model_response: ModelResponse,
    print_verbose: Callable,
    encoding,
    logging_obj,
    vertex_project=None,
    vertex_location=None,
    vertex_credentials=None,
    optional_params=None,
    litellm_params=None,
    logger_fn=None,
    acompletion: bool = False,
    client=None,
):
    try:
        import vertexai
        from anthropic import AnthropicVertex
    except:
        raise VertexAIError(
            status_code=400,
            message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""",
        )

    if not (
        hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
    ):
        raise VertexAIError(
            status_code=400,
            message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
        )
    try:

        ## Load Config
        config = litellm.VertexAIAnthropicConfig.get_config()
        for k, v in config.items():
            if k not in optional_params:
                optional_params[k] = v

        ## Format Prompt
        _is_function_call = False
        messages = copy.deepcopy(messages)
        optional_params = copy.deepcopy(optional_params)
        # Separate system prompt from rest of message
        system_prompt_indices = []
        system_prompt = ""
        for idx, message in enumerate(messages):
            if message["role"] == "system":
                system_prompt += message["content"]
                system_prompt_indices.append(idx)
        if len(system_prompt_indices) > 0:
            for idx in reversed(system_prompt_indices):
                messages.pop(idx)
        if len(system_prompt) > 0:
            optional_params["system"] = system_prompt
        # Format rest of message according to anthropic guidelines
        try:
            messages = prompt_factory(
                model=model, messages=messages, custom_llm_provider="anthropic_xml"
            )
        except Exception as e:
            raise VertexAIError(status_code=400, message=str(e))

        ## Handle Tool Calling
        if "tools" in optional_params:
            _is_function_call = True
            tool_calling_system_prompt = construct_tool_use_system_prompt(
                tools=optional_params["tools"]
            )
            optional_params["system"] = (
                optional_params.get("system", "\n") + tool_calling_system_prompt
            )  # add the anthropic tool calling prompt to the system prompt
            optional_params.pop("tools")

        stream = optional_params.pop("stream", None)

        data = {
            "model": model,
            "messages": messages,
            **optional_params,
        }
        print_verbose(f"_is_function_call: {_is_function_call}")

        ## Completion Call

        print_verbose(
            f"VERTEX AI: vertex_project={vertex_project}; vertex_location={vertex_location}; vertex_credentials={vertex_credentials}"
        )
        access_token = None
        if client is None:
            if vertex_credentials is not None and isinstance(vertex_credentials, str):
                import google.oauth2.service_account

                json_obj = json.loads(vertex_credentials)

                creds = (
                    google.oauth2.service_account.Credentials.from_service_account_info(
                        json_obj,
                        scopes=["https://www.googleapis.com/auth/cloud-platform"],
                    )
                )
                ### CHECK IF ACCESS
                access_token = refresh_auth(credentials=creds)

            vertex_ai_client = AnthropicVertex(
                project_id=vertex_project,
                region=vertex_location,
                access_token=access_token,
            )
        else:
            vertex_ai_client = client

        if acompletion == True:
            """
            - async streaming
            - async completion
            """
            if stream is not None and stream == True:
                return async_streaming(
                    model=model,
                    messages=messages,
                    data=data,
                    print_verbose=print_verbose,
                    model_response=model_response,
                    logging_obj=logging_obj,
                    vertex_project=vertex_project,
                    vertex_location=vertex_location,
                    optional_params=optional_params,
                    client=client,
                    access_token=access_token,
                )
            else:
                return async_completion(
                    model=model,
                    messages=messages,
                    data=data,
                    print_verbose=print_verbose,
                    model_response=model_response,
                    logging_obj=logging_obj,
                    vertex_project=vertex_project,
                    vertex_location=vertex_location,
                    optional_params=optional_params,
                    client=client,
                    access_token=access_token,
                )
        if stream is not None and stream == True:
            ## LOGGING
            logging_obj.pre_call(
                input=messages,
                api_key=None,
                additional_args={
                    "complete_input_dict": optional_params,
                },
            )
            response = vertex_ai_client.messages.create(**data, stream=True)  # type: ignore
            return response

        ## LOGGING
        logging_obj.pre_call(
            input=messages,
            api_key=None,
            additional_args={
                "complete_input_dict": optional_params,
            },
        )

        message = vertex_ai_client.messages.create(**data)  # type: ignore
        text_content = message.content[0].text
        ## TOOL CALLING - OUTPUT PARSE
        if text_content is not None and contains_tag("invoke", text_content):
            function_name = extract_between_tags("tool_name", text_content)[0]
            function_arguments_str = extract_between_tags("invoke", text_content)[
                0
            ].strip()
            function_arguments_str = f"<invoke>{function_arguments_str}</invoke>"
            function_arguments = parse_xml_params(function_arguments_str)
            _message = litellm.Message(
                tool_calls=[
                    {
                        "id": f"call_{uuid.uuid4()}",
                        "type": "function",
                        "function": {
                            "name": function_name,
                            "arguments": json.dumps(function_arguments),
                        },
                    }
                ],
                content=None,
            )
            model_response.choices[0].message = _message  # type: ignore
        else:
            model_response.choices[0].message.content = text_content  # type: ignore
        model_response.choices[0].finish_reason = map_finish_reason(message.stop_reason)

        ## CALCULATING USAGE
        prompt_tokens = message.usage.input_tokens
        completion_tokens = message.usage.output_tokens

        model_response["created"] = int(time.time())
        model_response["model"] = model
        usage = Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        )
        model_response.usage = usage
        return model_response
    except Exception as e:
        raise VertexAIError(status_code=500, message=str(e))


async def async_completion(
    model: str,
    messages: list,
    data: dict,
    model_response: ModelResponse,
    print_verbose: Callable,
    logging_obj,
    vertex_project=None,
    vertex_location=None,
    optional_params=None,
    client=None,
    access_token=None,
):
    from anthropic import AsyncAnthropicVertex

    if client is None:
        vertex_ai_client = AsyncAnthropicVertex(
            project_id=vertex_project, region=vertex_location, access_token=access_token
        )
    else:
        vertex_ai_client = client

    ## LOGGING
    logging_obj.pre_call(
        input=messages,
        api_key=None,
        additional_args={
            "complete_input_dict": optional_params,
        },
    )
    message = await vertex_ai_client.messages.create(**data)  # type: ignore
    text_content = message.content[0].text
    ## TOOL CALLING - OUTPUT PARSE
    if text_content is not None and contains_tag("invoke", text_content):
        function_name = extract_between_tags("tool_name", text_content)[0]
        function_arguments_str = extract_between_tags("invoke", text_content)[0].strip()
        function_arguments_str = f"<invoke>{function_arguments_str}</invoke>"
        function_arguments = parse_xml_params(function_arguments_str)
        _message = litellm.Message(
            tool_calls=[
                {
                    "id": f"call_{uuid.uuid4()}",
                    "type": "function",
                    "function": {
                        "name": function_name,
                        "arguments": json.dumps(function_arguments),
                    },
                }
            ],
            content=None,
        )
        model_response.choices[0].message = _message  # type: ignore
    else:
        model_response.choices[0].message.content = text_content  # type: ignore
    model_response.choices[0].finish_reason = map_finish_reason(message.stop_reason)

    ## CALCULATING USAGE
    prompt_tokens = message.usage.input_tokens
    completion_tokens = message.usage.output_tokens

    model_response["created"] = int(time.time())
    model_response["model"] = model
    usage = Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )
    model_response.usage = usage
    return model_response


async def async_streaming(
    model: str,
    messages: list,
    data: dict,
    model_response: ModelResponse,
    print_verbose: Callable,
    logging_obj,
    vertex_project=None,
    vertex_location=None,
    optional_params=None,
    client=None,
    access_token=None,
):
    from anthropic import AsyncAnthropicVertex

    if client is None:
        vertex_ai_client = AsyncAnthropicVertex(
            project_id=vertex_project, region=vertex_location, access_token=access_token
        )
    else:
        vertex_ai_client = client

    ## LOGGING
    logging_obj.pre_call(
        input=messages,
        api_key=None,
        additional_args={
            "complete_input_dict": optional_params,
        },
    )
    response = await vertex_ai_client.messages.create(**data, stream=True)  # type: ignore
    logging_obj.post_call(input=messages, api_key=None, original_response=response)

    streamwrapper = CustomStreamWrapper(
        completion_stream=response,
        model=model,
        custom_llm_provider="vertex_ai",
        logging_obj=logging_obj,
    )

    return streamwrapper
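The handler above is what litellm routes to for claude-3 models on Vertex AI (see the `if "claude-3" in model` branch added to main.py below). A hedged usage sketch, assuming a GCP project with the Claude models enabled and valid credentials; the project and location values here are placeholders:

import litellm

litellm.vertex_project = "my-gcp-project"   # placeholder project id
litellm.vertex_location = "us-central1"     # placeholder region where claude-3 is available

response = litellm.completion(
    model="vertex_ai/claude-3-sonnet@20240229",
    messages=[{"role": "user", "content": "Say hello from Vertex AI"}],
)
print(response.choices[0].message.content)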
@@ -12,8 +12,8 @@ from typing import Any, Literal, Union, BinaryIO
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx

import httpx
import litellm
from ._logging import verbose_logger
from litellm import (  # type: ignore

@@ -39,7 +39,6 @@ from litellm.utils import (
    get_optional_params_image_gen,
)
from .llms import (
    anthropic,
    anthropic_text,
    together_ai,
    ai21,

@@ -62,11 +61,14 @@ from .llms import (
    palm,
    gemini,
    vertex_ai,
    vertex_ai_anthropic,
    maritalk,
)
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion
from .llms.azure_text import AzureTextCompletion
from .llms.anthropic import AnthropicChatCompletion
from .llms.anthropic_text import AnthropicTextCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.prompt_templates.factory import (
    prompt_factory,

@@ -98,6 +100,8 @@ from litellm.utils import (
dotenv.load_dotenv()  # Loading env variables using dotenv
openai_chat_completions = OpenAIChatCompletion()
openai_text_completions = OpenAITextCompletion()
anthropic_chat_completions = AnthropicChatCompletion()
anthropic_text_completions = AnthropicTextCompletion()
azure_chat_completions = AzureChatCompletion()
azure_text_completions = AzureTextCompletion()
huggingface = Huggingface()

@@ -303,6 +307,7 @@ async def acompletion(
        or custom_llm_provider == "vertex_ai"
        or custom_llm_provider == "gemini"
        or custom_llm_provider == "sagemaker"
        or custom_llm_provider == "anthropic"
        or custom_llm_provider in litellm.openai_compatible_providers
    ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
        init_response = await loop.run_in_executor(None, func_with_context)

@@ -314,6 +319,14 @@ async def acompletion(
            response = await init_response
        else:
            response = init_response  # type: ignore

        if custom_llm_provider == "text-completion-openai" and isinstance(
            response, TextCompletionResponse
        ):
            response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
                response_object=response,
                model_response_object=litellm.ModelResponse(),
            )
    else:
        # Call the synchronous function using run_in_executor
        response = await loop.run_in_executor(None, func_with_context)  # type: ignore

@@ -608,6 +621,7 @@ def completion(
        "cache",
        "no-log",
        "base_model",
        "stream_timeout",
    ]
    default_params = openai_params + litellm_params
    non_default_params = {

@@ -1058,6 +1072,7 @@ def completion(
                api_key=api_key,
                api_base=api_base,
                acompletion=acompletion,
                client=client,  # pass AsyncOpenAI, OpenAI client
                logging_obj=logging,
                optional_params=optional_params,
                litellm_params=litellm_params,

@@ -1153,10 +1168,11 @@ def completion(
                or get_secret("ANTHROPIC_API_BASE")
                or "https://api.anthropic.com/v1/complete"
            )
            response = anthropic_text.completion(
            response = anthropic_text_completions.completion(
                model=model,
                messages=messages,
                api_base=api_base,
                acompletion=acompletion,
                custom_prompt_dict=litellm.custom_prompt_dict,
                model_response=model_response,
                print_verbose=print_verbose,

@@ -1177,10 +1193,11 @@ def completion(
                or get_secret("ANTHROPIC_API_BASE")
                or "https://api.anthropic.com/v1/messages"
            )
            response = anthropic.completion(
            response = anthropic_chat_completions.completion(
                model=model,
                messages=messages,
                api_base=api_base,
                acompletion=acompletion,
                custom_prompt_dict=litellm.custom_prompt_dict,
                model_response=model_response,
                print_verbose=print_verbose,

@@ -1192,19 +1209,6 @@ def completion(
                logging_obj=logging,
                headers=headers,
            )
            if (
                "stream" in optional_params
                and optional_params["stream"] == True
                and not isinstance(response, CustomStreamWrapper)
            ):
                # don't try to access stream object,
                response = CustomStreamWrapper(
                    response,
                    model,
                    custom_llm_provider="anthropic",
                    logging_obj=logging,
                )

            if optional_params.get("stream", False) or acompletion == True:
                ## LOGGING
                logging.post_call(

@@ -1673,7 +1677,28 @@ def completion(
                or litellm.vertex_location
                or get_secret("VERTEXAI_LOCATION")
            )

            vertex_credentials = (
                optional_params.pop("vertex_credentials", None)
                or optional_params.pop("vertex_ai_credentials", None)
                or get_secret("VERTEXAI_CREDENTIALS")
            )
            if "claude-3" in model:
                model_response = vertex_ai_anthropic.completion(
                    model=model,
                    messages=messages,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    optional_params=optional_params,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    encoding=encoding,
                    vertex_location=vertex_ai_location,
                    vertex_project=vertex_ai_project,
                    vertex_credentials=vertex_credentials,
                    logging_obj=logging,
                    acompletion=acompletion,
                )
            else:
                model_response = vertex_ai.completion(
                    model=model,
                    messages=messages,

@@ -3767,6 +3792,9 @@ async def ahealth_check(

        api_base = model_params.get("api_base") or get_secret("OPENAI_API_BASE")

        if custom_llm_provider == "text-completion-openai":
            mode = "completion"

        response = await openai_chat_completions.ahealth_check(
            model=model,
            messages=model_params.get(

@@ -3800,11 +3828,15 @@ async def ahealth_check(
        return response
    except Exception as e:
        traceback.print_exc()
        stack_trace = traceback.format_exc()
        if isinstance(stack_trace, str):
            stack_trace = stack_trace[:1000]
        if model not in litellm.model_cost and mode is None:
            raise Exception(
                "Missing `mode`. Set the `mode` for the model - https://docs.litellm.ai/docs/proxy/health#embedding-models"
            )
        return {"error": f"{str(e)}"}
        error_to_return = str(e) + " stack trace: " + stack_trace
        return {"error": error_to_return}


####### HELPER FUNCTIONS ################
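Since acompletion now routes "vertex_ai" (and "gemini") through the async path, streaming the same claude-3 deployment asynchronously should follow the usual litellm pattern; a sketch under the same placeholder project/location assumptions as above:

import asyncio
import litellm

async def main():
    stream = await litellm.acompletion(
        model="vertex_ai/claude-3-sonnet@20240229",
        messages=[{"role": "user", "content": "Stream a short haiku"}],
        stream=True,
    )
    async for chunk in stream:
        # Chunks follow the OpenAI-style streaming delta format.
        print(chunk.choices[0].delta.content or "", end="")

asyncio.run(main())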
@@ -11,7 +11,7 @@
    },
    "gpt-4-turbo-preview": {
        "max_tokens": 4096,
        "max_input_tokens": 8192,
        "max_input_tokens": 128000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,

@@ -66,6 +66,28 @@
        "litellm_provider": "openai",
        "mode": "chat"
    },
    "gpt-4-turbo": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-turbo-2024-04-09": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_parallel_function_calling": true
    },
    "gpt-4-1106-preview": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,

@@ -95,7 +117,8 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
        "mode": "chat"
        "mode": "chat",
        "supports_vision": true
    },
    "gpt-4-1106-vision-preview": {
        "max_tokens": 4096,

@@ -104,7 +127,8 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "openai",
        "mode": "chat"
        "mode": "chat",
        "supports_vision": true
    },
    "gpt-3.5-turbo": {
        "max_tokens": 4097,

@@ -187,6 +211,7 @@
    "text-embedding-3-large": {
        "max_tokens": 8191,
        "max_input_tokens": 8191,
        "output_vector_size": 3072,
        "input_cost_per_token": 0.00000013,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "openai",

@@ -195,6 +220,7 @@
    "text-embedding-3-small": {
        "max_tokens": 8191,
        "max_input_tokens": 8191,
        "output_vector_size": 1536,
        "input_cost_per_token": 0.00000002,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "openai",

@@ -203,6 +229,7 @@
    "text-embedding-ada-002": {
        "max_tokens": 8191,
        "max_input_tokens": 8191,
        "output_vector_size": 1536,
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "openai",

@@ -387,7 +414,8 @@
        "input_cost_per_token": 0.00001,
        "output_cost_per_token": 0.00003,
        "litellm_provider": "azure",
        "mode": "chat"
        "mode": "chat",
        "supports_vision": true
    },
    "azure/gpt-35-turbo-16k-0613": {
        "max_tokens": 4096,

@@ -474,6 +502,16 @@
        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/command-r-plus": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000015,
        "litellm_provider": "azure",
        "mode": "chat",
        "supports_function_calling": true
    },
    "azure/ada": {
        "max_tokens": 8191,
        "max_input_tokens": 8191,

@@ -682,7 +720,8 @@
        "input_cost_per_token": 0.00000070,
        "output_cost_per_token": 0.00000080,
        "litellm_provider": "groq",
        "mode": "chat"
        "mode": "chat",
        "supports_function_calling": true
    },
    "groq/mixtral-8x7b-32768": {
        "max_tokens": 32768,

@@ -691,7 +730,8 @@
        "input_cost_per_token": 0.00000027,
        "output_cost_per_token": 0.00000027,
        "litellm_provider": "groq",
        "mode": "chat"
        "mode": "chat",
        "supports_function_calling": true
    },
    "groq/gemma-7b-it": {
        "max_tokens": 8192,

@@ -700,7 +740,8 @@
        "input_cost_per_token": 0.00000010,
        "output_cost_per_token": 0.00000010,
        "litellm_provider": "groq",
        "mode": "chat"
        "mode": "chat",
        "supports_function_calling": true
    },
    "claude-instant-1.2": {
        "max_tokens": 8191,

@@ -938,6 +979,28 @@
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.0-pro-001": {
        "max_tokens": 8192,
        "max_input_tokens": 32760,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000025,
        "output_cost_per_token": 0.0000005,
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat",
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.0-pro-002": {
        "max_tokens": 8192,
        "max_input_tokens": 32760,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000025,
        "output_cost_per_token": 0.0000005,
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat",
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.5-pro": {
        "max_tokens": 8192,
        "max_input_tokens": 1000000,

@@ -960,6 +1023,28 @@
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.5-pro-preview-0409": {
        "max_tokens": 8192,
        "max_input_tokens": 1000000,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0,
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat",
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-experimental": {
        "max_tokens": 8192,
        "max_input_tokens": 1000000,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0,
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-language-models",
        "mode": "chat",
        "supports_function_calling": false,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-pro-vision": {
        "max_tokens": 2048,
        "max_input_tokens": 16384,

@@ -972,6 +1057,7 @@
        "litellm_provider": "vertex_ai-vision-models",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_vision": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.0-pro-vision": {

@@ -986,6 +1072,7 @@
        "litellm_provider": "vertex_ai-vision-models",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_vision": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.0-pro-vision-001": {

@@ -1000,21 +1087,35 @@
        "litellm_provider": "vertex_ai-vision-models",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_vision": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini-1.5-pro-vision": {
        "max_tokens": 8192,
        "max_input_tokens": 1000000,
        "max_output_tokens": 8192,
        "max_images_per_prompt": 16,
        "max_videos_per_prompt": 1,
        "max_video_length": 2,
        "input_cost_per_token": 0,
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-vision-models",
        "mode": "chat",
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    "vertex_ai/claude-3-sonnet@20240229": {
        "max_tokens": 4096,
        "max_input_tokens": 200000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000015,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat"
    },
    "vertex_ai/claude-3-haiku@20240307": {
        "max_tokens": 4096,
        "max_input_tokens": 200000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.00000025,
        "output_cost_per_token": 0.00000125,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat"
    },
    "vertex_ai/claude-3-opus@20240229": {
        "max_tokens": 4096,
        "max_input_tokens": 200000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.0000075,
        "litellm_provider": "vertex_ai-anthropic_models",
        "mode": "chat"
    },
    "textembedding-gecko": {
        "max_tokens": 3072,

@@ -1066,6 +1167,27 @@
        "mode": "embedding",
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "text-embedding-preview-0409": {
        "max_tokens": 3072,
        "max_input_tokens": 3072,
        "output_vector_size": 768,
        "input_cost_per_token": 0.00000000625,
        "input_cost_per_token_batch_requests": 0.000000005,
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-embedding-models",
        "mode": "embedding",
        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
    },
    "text-multilingual-embedding-preview-0409":{
        "max_tokens": 3072,
        "max_input_tokens": 3072,
        "output_vector_size": 768,
        "input_cost_per_token": 0.00000000625,
        "output_cost_per_token": 0,
        "litellm_provider": "vertex_ai-embedding-models",
        "mode": "embedding",
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "palm/chat-bison": {
        "max_tokens": 4096,
        "max_input_tokens": 8192,

@@ -1157,17 +1279,7 @@
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_function_calling": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini/gemini-1.5-pro-vision": {
        "max_tokens": 8192,
        "max_input_tokens": 1000000,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0,
        "output_cost_per_token": 0,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_function_calling": true,
        "supports_vision": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "command-r": {

@@ -1189,6 +1301,16 @@
        "litellm_provider": "cohere_chat",
        "mode": "chat"
    },
    "command-r-plus": {
        "max_tokens": 4096,
        "max_input_tokens": 128000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000015,
        "litellm_provider": "cohere_chat",
        "mode": "chat",
        "supports_function_calling": true
    },
    "command-nightly": {
        "max_tokens": 4096,
        "max_input_tokens": 4096,

@@ -1512,6 +1634,15 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "mistral.mistral-large-2402-v1:0": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.000008,
        "output_cost_per_token": 0.000024,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-west-2/mistral.mixtral-8x7b-instruct-v0:1": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,

@@ -1521,7 +1652,25 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-west-2/mistral.mistral-7b-instruct": {
    "bedrock/us-east-1/mistral.mixtral-8x7b-instruct-v0:1": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.00000045,
        "output_cost_per_token": 0.0000007,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/eu-west-3/mistral.mixtral-8x7b-instruct-v0:1": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.00000059,
        "output_cost_per_token": 0.00000091,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-west-2/mistral.mistral-7b-instruct-v0:2": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,

@@ -1530,6 +1679,51 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-east-1/mistral.mistral-7b-instruct-v0:2": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.00000015,
        "output_cost_per_token": 0.0000002,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/eu-west-3/mistral.mistral-7b-instruct-v0:2": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.0000002,
        "output_cost_per_token": 0.00000026,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-east-1/mistral.mistral-large-2402-v1:0": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.000008,
        "output_cost_per_token": 0.000024,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/us-west-2/mistral.mistral-large-2402-v1:0": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.000008,
        "output_cost_per_token": 0.000024,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "bedrock/eu-west-3/mistral.mistral-large-2402-v1:0": {
        "max_tokens": 8191,
        "max_input_tokens": 32000,
        "max_output_tokens": 8191,
        "input_cost_per_token": 0.0000104,
        "output_cost_per_token": 0.0000312,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "anthropic.claude-3-sonnet-20240229-v1:0": {
        "max_tokens": 4096,
        "max_input_tokens": 200000,

@@ -1548,6 +1742,15 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "anthropic.claude-3-opus-20240229-v1:0": {
        "max_tokens": 4096,
        "max_input_tokens": 200000,
        "max_output_tokens": 4096,
        "input_cost_per_token": 0.000015,
        "output_cost_per_token": 0.000075,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
    "anthropic.claude-v1": {
        "max_tokens": 8191,
        "max_input_tokens": 100000,

@@ -2714,6 +2917,46 @@
        "output_cost_per_token": 0.000000,
        "litellm_provider": "voyage",
        "mode": "embedding"
    },
    "voyage/voyage-large-2": {
        "max_tokens": 16000,
        "max_input_tokens": 16000,
        "input_cost_per_token": 0.00000012,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "voyage",
        "mode": "embedding"
    },
    "voyage/voyage-law-2": {
        "max_tokens": 16000,
        "max_input_tokens": 16000,
        "input_cost_per_token": 0.00000012,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "voyage",
        "mode": "embedding"
    },
    "voyage/voyage-code-2": {
        "max_tokens": 16000,
        "max_input_tokens": 16000,
        "input_cost_per_token": 0.00000012,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "voyage",
        "mode": "embedding"
    },
    "voyage/voyage-2": {
        "max_tokens": 4000,
        "max_input_tokens": 4000,
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "voyage",
        "mode": "embedding"
    },
    "voyage/voyage-lite-02-instruct": {
        "max_tokens": 4000,
        "max_input_tokens": 4000,
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "voyage",
        "mode": "embedding"
    }

}
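Each entry in this pricing map is keyed by model name and stores per-token USD costs, which cost tracking multiplies against reported usage. A rough sketch of that arithmetic using the gpt-4-turbo entry added above (the prices dict is copied from the diff; the helper itself is illustrative, not a litellm API):

prices = {
    "gpt-4-turbo": {"input_cost_per_token": 0.00001, "output_cost_per_token": 0.00003},
}

def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    # USD cost = prompt tokens * input price + completion tokens * output price.
    entry = prices[model]
    return (
        prompt_tokens * entry["input_cost_per_token"]
        + completion_tokens * entry["output_cost_per_token"]
    )

print(estimate_cost("gpt-4-turbo", 1000, 500))  # 0.01 + 0.015 = 0.025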
File diff suppressed because one or more lines are too long

[Regenerated Next.js build artifacts for the LiteLLM Dashboard (proxy admin UI): minified webpack chunks, the prerendered index.html, and its flight data were rebuilt. The stylesheet hash changes from 30c9f28a6132bc6e.css to 11cfce8bfdf6e8f1.css, the webpack runtime from webpack-0b24ae51efb18f55.js to webpack-59f93936973f5f5a.js, the page chunk from page-3b94de7ffed5f02e.js to page-15d0c6c10d700825.js, and the build id from QiWWLSM3qA6xSFhVxek_e to fcTpSzljtxsSagYnqnMB2; the remaining minified chunk contents are generated output and carry no readable changes.]
|
||||
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
|
||||
1:null
|
||||
|
|
|
@ -4,29 +4,48 @@ model_list:
|
|||
model: openai/my-fake-model
|
||||
api_key: my-fake-key
|
||||
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
||||
stream_timeout: 0.001
|
||||
rpm: 10
|
||||
- litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
stream_timeout: 0.001
|
||||
model_name: azure-gpt-3.5
|
||||
# - model_name: text-embedding-ada-002
|
||||
# litellm_params:
|
||||
# model: text-embedding-ada-002
|
||||
# api_key: os.environ/OPENAI_API_KEY
|
||||
- model_name: gpt-instruct
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo-instruct
|
||||
model: text-completion-openai/gpt-3.5-turbo-instruct
|
||||
# api_key: my-fake-key
|
||||
# api_base: https://exampleopenaiendpoint-production.up.railway.app/
|
||||
|
||||
litellm_settings:
|
||||
success_callback: ["prometheus"]
|
||||
service_callback: ["prometheus_system"]
|
||||
upperbound_key_generate_params:
|
||||
max_budget: os.environ/LITELLM_UPPERBOUND_KEYS_MAX_BUDGET
|
||||
|
||||
# litellm_settings:
|
||||
# drop_params: True
|
||||
# max_budget: 800021
|
||||
# budget_duration: 30d
|
||||
# # cache: true
|
||||
|
||||
router_settings:
|
||||
routing_strategy: usage-based-routing-v2
|
||||
redis_host: os.environ/REDIS_HOST
|
||||
redis_password: os.environ/REDIS_PASSWORD
|
||||
redis_port: os.environ/REDIS_PORT
|
||||
enable_pre_call_checks: True
|
||||
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
allow_user_auth: true
|
||||
alerting: ["slack"]
|
||||
store_model_in_db: True
|
||||
# proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
|
||||
store_model_in_db: True # set via environment variable - os.environ["STORE_MODEL_IN_DB"] = "True"
|
||||
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
|
||||
enable_jwt_auth: True
|
||||
alerting: ["slack"]
|
||||
litellm_jwtauth:
|
||||
admin_jwt_scope: "litellm_proxy_admin"
|
||||
public_key_ttl: 600
|
||||
public_key_ttl: os.environ/LITELLM_PUBLIC_KEY_TTL
|
||||
user_id_jwt_field: "sub"
|
||||
org_id_jwt_field: "azp"
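The `os.environ/` prefix used throughout this config tells the proxy to read the actual value from an environment variable at load time (elsewhere in this commit such values are resolved via `litellm.get_secret`). A minimal sketch of that resolution, using a hypothetical helper name purely for illustration:

import os

def resolve_env_refs(value):
    # Hypothetical helper: values written as "os.environ/VAR_NAME" are read from the environment.
    if isinstance(value, str) and value.startswith("os.environ/"):
        return os.environ.get(value.split("/", 1)[1])
    if isinstance(value, dict):
        return {k: resolve_env_refs(v) for k, v in value.items()}
    return value

print(resolve_env_refs({"redis_host": "os.environ/REDIS_HOST"}))  # -> {"redis_host": <value of $REDIS_HOST>}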
|
|
@ -38,6 +38,18 @@ class LiteLLMBase(BaseModel):
|
|||
protected_namespaces = ()
|
||||
|
||||
|
||||
class LiteLLM_UpperboundKeyGenerateParams(LiteLLMBase):
|
||||
"""
|
||||
Set the default upper bound on the max budget a key created via `/key/generate` can have.
|
||||
"""
|
||||
|
||||
max_budget: Optional[float] = None
|
||||
budget_duration: Optional[str] = None
|
||||
max_parallel_requests: Optional[int] = None
|
||||
tpm_limit: Optional[int] = None
|
||||
rpm_limit: Optional[int] = None
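Each field above, when configured, acts as a ceiling on what a caller may request from `/key/generate`. A minimal sketch of that idea, assuming requests above the bound are simply clamped to it (the proxy may instead reject such requests; this is illustrative only):

from typing import Optional

def clamp_to_upperbound(requested: Optional[float], upperbound: Optional[float]) -> Optional[float]:
    # Illustration: cap a requested key budget at the configured upper bound.
    if upperbound is None:
        return requested
    if requested is None or requested > upperbound:
        return upperbound
    return requested

print(clamp_to_upperbound(requested=500.0, upperbound=100.0))  # -> 100.0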
|
||||
|
||||
|
||||
class LiteLLMRoutes(enum.Enum):
|
||||
openai_routes: List = [ # chat completions
|
||||
"/openai/deployments/{model}/chat/completions",
|
||||
|
@ -91,6 +103,26 @@ class LiteLLMRoutes(enum.Enum):
|
|||
"/model/info",
|
||||
]
|
||||
|
||||
spend_tracking_routes: List = [
|
||||
# spend
|
||||
"/spend/keys",
|
||||
"/spend/users",
|
||||
"/spend/tags",
|
||||
"/spend/calculate",
|
||||
"/spend/logs",
|
||||
]
|
||||
|
||||
global_spend_tracking_routes: List = [
|
||||
# global spend
|
||||
"/global/spend/logs",
|
||||
"/global/spend",
|
||||
"/global/spend/keys",
|
||||
"/global/spend/teams",
|
||||
"/global/spend/end_users",
|
||||
"/global/spend/models",
|
||||
"/global/predict/spend/logs",
|
||||
]
|
||||
|
||||
public_routes: List = [
|
||||
"/routes",
|
||||
"/",
|
||||
|
@ -102,6 +134,18 @@ class LiteLLMRoutes(enum.Enum):
|
|||
]
|
||||
|
||||
|
||||
# class LiteLLMAllowedRoutes(LiteLLMBase):
|
||||
# """
|
||||
# Defines allowed routes based on key type.
|
||||
|
||||
# Types = ["admin", "team", "user", "unmapped"]
|
||||
# """
|
||||
|
||||
# admin_allowed_routes: List[
|
||||
# Literal["openai_routes", "info_routes", "management_routes", "spend_tracking_routes", "global_spend_tracking_routes"]
|
||||
# ] = ["management_routes"]
|
||||
|
||||
|
||||
class LiteLLM_JWTAuth(LiteLLMBase):
|
||||
"""
|
||||
A class to define the roles and permissions for a LiteLLM Proxy w/ JWT Auth.
|
||||
|
@ -112,7 +156,8 @@ class LiteLLM_JWTAuth(LiteLLMBase):
|
|||
- team_jwt_scope: The JWT scope required for proxy team roles.
|
||||
- team_id_jwt_field: The field in the JWT token that stores the team ID. Default - `client_id`.
|
||||
- team_allowed_routes: list of allowed routes for proxy team roles.
|
||||
- end_user_id_jwt_field: Default - `sub`. The field in the JWT token that stores the end-user ID. Turn this off by setting to `None`. Enables end-user cost tracking.
|
||||
- user_id_jwt_field: The field in the JWT token that stores the user id (maps to `LiteLLMUserTable`). Use this for internal employees.
|
||||
- end_user_id_jwt_field: The field in the JWT token that stores the end-user ID (maps to `LiteLLMEndUserTable`). Turn this off by setting to `None`. Enables end-user cost tracking. Use this for external customers.
|
||||
- public_key_ttl: Default - 600s. TTL for caching public JWT keys.
|
||||
|
||||
See `auth_checks.py` for the specific routes
|
||||
|
@ -127,7 +172,9 @@ class LiteLLM_JWTAuth(LiteLLMBase):
|
|||
team_allowed_routes: List[
|
||||
Literal["openai_routes", "info_routes", "management_routes"]
|
||||
] = ["openai_routes", "info_routes"]
|
||||
end_user_id_jwt_field: Optional[str] = "sub"
|
||||
org_id_jwt_field: Optional[str] = None
|
||||
user_id_jwt_field: Optional[str] = None
|
||||
end_user_id_jwt_field: Optional[str] = None
|
||||
public_key_ttl: float = 600
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
|
@ -363,6 +410,11 @@ class NewUserRequest(GenerateKeyRequest):
|
|||
max_budget: Optional[float] = None
|
||||
user_email: Optional[str] = None
|
||||
user_role: Optional[str] = None
|
||||
teams: Optional[list] = None
|
||||
organization_id: Optional[str] = None
|
||||
auto_create_key: bool = (
|
||||
True # flag used for returning a key as part of the /user/new response
|
||||
)
|
||||
|
||||
|
||||
class NewUserResponse(GenerateKeyResponse):
|
||||
|
@ -438,8 +490,16 @@ class TeamMemberDeleteRequest(LiteLLMBase):
|
|||
return values
|
||||
|
||||
|
||||
class UpdateTeamRequest(TeamBase):
|
||||
class UpdateTeamRequest(LiteLLMBase):
|
||||
team_id: str # required
|
||||
team_alias: Optional[str] = None
|
||||
organization_id: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
tpm_limit: Optional[int] = None
|
||||
rpm_limit: Optional[int] = None
|
||||
max_budget: Optional[float] = None
|
||||
models: Optional[list] = None
|
||||
blocked: Optional[bool] = None
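Every field other than `team_id` is optional, so an update only needs to carry the attributes being changed. A minimal illustration of building the request model (the values are made up):

from litellm.proxy._types import UpdateTeamRequest

req = UpdateTeamRequest(team_id="team-1", models=["azure-gpt-3.5"], tpm_limit=100000, blocked=False)
print(req.dict(exclude_none=True))  # only team_id plus the fields being updated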
|
||||
|
||||
|
||||
class DeleteTeamRequest(LiteLLMBase):
|
||||
|
@ -495,6 +555,7 @@ class LiteLLM_BudgetTable(LiteLLMBase):
|
|||
|
||||
|
||||
class NewOrganizationRequest(LiteLLM_BudgetTable):
|
||||
organization_id: Optional[str] = None
|
||||
organization_alias: str
|
||||
models: List = []
|
||||
budget_id: Optional[str] = None
|
||||
|
@ -503,6 +564,7 @@ class NewOrganizationRequest(LiteLLM_BudgetTable):
|
|||
class LiteLLM_OrganizationTable(LiteLLMBase):
|
||||
"""Represents user-controllable params for a LiteLLM_OrganizationTable record"""
|
||||
|
||||
organization_id: Optional[str] = None
|
||||
organization_alias: Optional[str] = None
|
||||
budget_id: str
|
||||
metadata: Optional[dict] = None
|
||||
|
@ -687,6 +749,8 @@ class LiteLLM_VerificationToken(LiteLLMBase):
|
|||
soft_budget_cooldown: bool = False
|
||||
litellm_budget_table: Optional[dict] = None
|
||||
|
||||
org_id: Optional[str] = None # org id for a given key
|
||||
|
||||
# hidden params used for parallel request limiting, not required to create a token
|
||||
user_id_rate_limits: Optional[dict] = None
|
||||
team_id_rate_limits: Optional[dict] = None
|
||||
|
|
|
@ -14,6 +14,7 @@ from litellm.proxy._types import (
|
|||
LiteLLM_JWTAuth,
|
||||
LiteLLM_TeamTable,
|
||||
LiteLLMRoutes,
|
||||
LiteLLM_OrganizationTable,
|
||||
)
|
||||
from typing import Optional, Literal, Union
|
||||
from litellm.proxy.utils import PrismaClient
|
||||
|
@ -26,6 +27,7 @@ all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes
|
|||
def common_checks(
|
||||
request_body: dict,
|
||||
team_object: LiteLLM_TeamTable,
|
||||
user_object: Optional[LiteLLM_UserTable],
|
||||
end_user_object: Optional[LiteLLM_EndUserTable],
|
||||
global_proxy_spend: Optional[float],
|
||||
general_settings: dict,
|
||||
|
@ -37,7 +39,8 @@ def common_checks(
|
|||
1. If team is blocked
|
||||
2. If team can call model
|
||||
3. If team is in budget
|
||||
4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
5. If user passed in (JWT or key.user_id) - is in budget
|
||||
4. If end_user (either via JWT or 'user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
5. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
|
||||
6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
||||
"""
|
||||
|
@ -69,14 +72,20 @@ def common_checks(
|
|||
raise Exception(
|
||||
f"Team={team_object.team_id} over budget. Spend={team_object.spend}, Budget={team_object.max_budget}"
|
||||
)
|
||||
# 4. If user (passed in via JWT or key.user_id) is in budget
|
||||
if user_object is not None and user_object.max_budget is not None:
|
||||
user_budget = user_object.max_budget
|
||||
if user_object.spend > user_budget:
|
||||
raise Exception(
|
||||
f"ExceededBudget: User={user_object.user_id} over budget. Spend={user_object.spend}, Budget={user_budget}"
|
||||
)
|
||||
# 5. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
if end_user_object is not None and end_user_object.litellm_budget_table is not None:
|
||||
end_user_budget = end_user_object.litellm_budget_table.max_budget
|
||||
if end_user_budget is not None and end_user_object.spend > end_user_budget:
|
||||
raise Exception(
|
||||
f"ExceededBudget: End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
|
||||
)
|
||||
# 5. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
|
||||
# 6. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
|
||||
if (
|
||||
general_settings.get("enforce_user_param", None) is not None
|
||||
and general_settings["enforce_user_param"] == True
|
||||
|
@ -85,7 +94,7 @@ def common_checks(
|
|||
raise Exception(
|
||||
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
|
||||
)
|
||||
# 6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
||||
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
||||
if litellm.max_budget > 0 and global_proxy_spend is not None:
|
||||
if global_proxy_spend > litellm.max_budget:
|
||||
raise Exception(
|
||||
|
@ -95,6 +104,13 @@ def common_checks(
|
|||
|
||||
|
||||
def _allowed_routes_check(user_route: str, allowed_routes: list) -> bool:
|
||||
"""
|
||||
Return if a user is allowed to access route. Helper function for `allowed_routes_check`.
|
||||
|
||||
Parameters:
|
||||
- user_route: str - the route the user is trying to call
|
||||
- allowed_routes: List[str|LiteLLMRoutes] - the list of allowed routes for the user.
|
||||
"""
|
||||
for allowed_route in allowed_routes:
|
||||
if (
|
||||
allowed_route == LiteLLMRoutes.openai_routes.name
|
||||
|
@ -117,7 +133,7 @@ def _allowed_routes_check(user_route: str, allowed_routes: list) -> bool:
|
|||
|
||||
|
||||
def allowed_routes_check(
|
||||
user_role: Literal["proxy_admin", "team"],
|
||||
user_role: Literal["proxy_admin", "team", "user"],
|
||||
user_route: str,
|
||||
litellm_proxy_roles: LiteLLM_JWTAuth,
|
||||
) -> bool:
|
||||
|
@ -204,19 +220,24 @@ async def get_end_user_object(
|
|||
return None
|
||||
|
||||
|
||||
async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
|
||||
async def get_user_object(
|
||||
user_id: str,
|
||||
prisma_client: Optional[PrismaClient],
|
||||
user_api_key_cache: DualCache,
|
||||
) -> Optional[LiteLLM_UserTable]:
|
||||
"""
|
||||
- Check if user id in proxy User Table
|
||||
- if valid, return LiteLLM_UserTable object with defined limits
|
||||
- if not, then raise an error
|
||||
"""
|
||||
if self.prisma_client is None:
|
||||
raise Exception(
|
||||
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
|
||||
)
|
||||
if prisma_client is None:
|
||||
raise Exception("No db connected")
|
||||
|
||||
if user_id is None:
|
||||
return None
|
||||
|
||||
# check if in cache
|
||||
cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
|
||||
cached_user_obj = await user_api_key_cache.async_get_cache(key=user_id)
|
||||
if cached_user_obj is not None:
|
||||
if isinstance(cached_user_obj, dict):
|
||||
return LiteLLM_UserTable(**cached_user_obj)
|
||||
|
@ -224,7 +245,7 @@ async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
|
|||
return cached_user_obj
|
||||
# else, check db
|
||||
try:
|
||||
response = await self.prisma_client.db.litellm_usertable.find_unique(
|
||||
response = await prisma_client.db.litellm_usertable.find_unique(
|
||||
where={"user_id": user_id}
|
||||
)
|
||||
|
||||
|
@ -232,9 +253,9 @@ async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
|
|||
raise Exception
|
||||
|
||||
return LiteLLM_UserTable(**response.dict())
|
||||
except Exception as e:
|
||||
except Exception as e: # if user not in db
|
||||
raise Exception(
|
||||
f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
|
||||
f"User doesn't exist in db. 'user_id'={user_id}. Create user via `/user/new` call."
|
||||
)
|
||||
|
||||
|
||||
|
@ -274,3 +295,41 @@ async def get_team_object(
|
|||
raise Exception(
|
||||
f"Team doesn't exist in db. Team={team_id}. Create team via `/team/new` call."
|
||||
)
|
||||
|
||||
|
||||
async def get_org_object(
|
||||
org_id: str,
|
||||
prisma_client: Optional[PrismaClient],
|
||||
user_api_key_cache: DualCache,
|
||||
):
|
||||
"""
|
||||
- Check if org id in proxy Org Table
|
||||
- if valid, return LiteLLM_OrganizationTable object
|
||||
- if not, then raise an error
|
||||
"""
|
||||
if prisma_client is None:
|
||||
raise Exception(
|
||||
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
|
||||
)
|
||||
|
||||
# check if in cache
|
||||
cached_org_obj = await user_api_key_cache.async_get_cache(key="org_id:{}".format(org_id))
|
||||
if cached_org_obj is not None:
|
||||
if isinstance(cached_org_obj, dict):
|
||||
return cached_org_obj
|
||||
elif isinstance(cached_org_obj, LiteLLM_OrganizationTable):
|
||||
return cached_org_obj
|
||||
# else, check db
|
||||
try:
|
||||
response = await prisma_client.db.litellm_organizationtable.find_unique(
|
||||
where={"organization_id": org_id}
|
||||
)
|
||||
|
||||
if response is None:
|
||||
raise Exception
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Organization doesn't exist in db. Organization={org_id}. Create organization via `/organization/new` call."
|
||||
)
|
||||
|
|
|
@ -74,6 +74,26 @@ class JWTHandler:
|
|||
team_id = default_value
|
||||
return team_id
|
||||
|
||||
def get_user_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
|
||||
try:
|
||||
if self.litellm_jwtauth.user_id_jwt_field is not None:
|
||||
user_id = token[self.litellm_jwtauth.user_id_jwt_field]
|
||||
else:
|
||||
user_id = None
|
||||
except KeyError:
|
||||
user_id = default_value
|
||||
return user_id
|
||||
|
||||
def get_org_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
|
||||
try:
|
||||
if self.litellm_jwtauth.org_id_jwt_field is not None:
|
||||
org_id = token[self.litellm_jwtauth.org_id_jwt_field]
|
||||
else:
|
||||
org_id = None
|
||||
except KeyError:
|
||||
org_id = default_value
|
||||
return org_id
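Both getters follow the same pattern: look up a configurable claim name in the already-decoded token and fall back to a default when the claim is missing. A condensed sketch of that lookup (the claim names below mirror the `litellm_jwtauth` config shown earlier; everything else is illustrative):

from typing import Optional

def get_claim(token: dict, field: Optional[str], default_value: Optional[str]) -> Optional[str]:
    # No configured field -> the handler returns None for that id.
    if field is None:
        return None
    return token.get(field, default_value)

decoded = {"sub": "user-123", "azp": "org-456"}
print(get_claim(decoded, "sub", None))  # user id, via user_id_jwt_field: "sub"
print(get_claim(decoded, "azp", None))  # org id, via org_id_jwt_field: "azp"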
|
||||
|
||||
def get_scopes(self, token: dict) -> list:
|
||||
try:
|
||||
if isinstance(token["scope"], str):
|
||||
|
@ -101,7 +121,11 @@ class JWTHandler:
|
|||
if cached_keys is None:
|
||||
response = await self.http_handler.get(keys_url)
|
||||
|
||||
response_json = response.json()
|
||||
if "keys" in response_json:
|
||||
keys = response.json()["keys"]
|
||||
else:
|
||||
keys = response_json
|
||||
|
||||
await self.user_api_key_cache.async_set_cache(
|
||||
key="litellm_jwt_auth_keys",
|
||||
|
|
|
@ -86,7 +86,12 @@ async def perform_health_check(
|
|||
return [], []
|
||||
|
||||
if model is not None:
|
||||
model_list = [x for x in model_list if x["litellm_params"]["model"] == model]
|
||||
_new_model_list = [
|
||||
x for x in model_list if x["litellm_params"]["model"] == model
|
||||
]
|
||||
if _new_model_list == []:
|
||||
_new_model_list = [x for x in model_list if x["model_name"] == model]
|
||||
model_list = _new_model_list
|
||||
|
||||
healthy_endpoints, unhealthy_endpoints = await _perform_health_check(model_list)
|
||||
|
||||
|
|
|
@ -79,7 +79,7 @@ class _PROXY_BatchRedisRequests(CustomLogger):
|
|||
self.print_verbose(f"redis keys: {keys}")
|
||||
if len(keys) > 0:
|
||||
key_value_dict = (
|
||||
await litellm.cache.cache.async_get_cache_pipeline(
|
||||
await litellm.cache.cache.async_batch_get_cache(
|
||||
key_list=keys
|
||||
)
|
||||
)
|
||||
|
|
|
@ -425,9 +425,10 @@ def run_server(
|
|||
)
|
||||
|
||||
proxy_config = ProxyConfig()
|
||||
_, _, general_settings = asyncio.run(
|
||||
proxy_config.load_config(router=None, config_file_path=config)
|
||||
)
|
||||
_config = asyncio.run(proxy_config.get_config(config_file_path=config))
|
||||
general_settings = _config.get("general_settings", {})
|
||||
if general_settings is None:
|
||||
general_settings = {}
|
||||
database_url = general_settings.get("database_url", None)
|
||||
db_connection_pool_limit = general_settings.get(
|
||||
"database_connection_pool_limit", 100
|
||||
|
|
|
@ -1,49 +1,26 @@
|
|||
model_list:
|
||||
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||
api_version: "2023-05-15"
|
||||
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
|
||||
- model_name: gpt-3.5-turbo-large
|
||||
litellm_params:
|
||||
model: "gpt-3.5-turbo-1106"
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
- model_name: gpt-4
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||
api_version: "2023-05-15"
|
||||
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
|
||||
- model_name: sagemaker-completion-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
||||
input_cost_per_second: 0.000420
|
||||
- model_name: text-embedding-ada-002
|
||||
litellm_params:
|
||||
model: azure/azure-embedding-model
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||
api_version: "2023-05-15"
|
||||
model_info:
|
||||
mode: embedding
|
||||
base_model: text-embedding-ada-002
|
||||
- model_name: dall-e-2
|
||||
litellm_params:
|
||||
model: azure/
|
||||
api_version: 2023-06-01-preview
|
||||
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
- model_name: openai-dall-e-3
|
||||
litellm_params:
|
||||
model: dall-e-3
|
||||
- model_name: fake-openai-endpoint
|
||||
litellm_params:
|
||||
model: openai/fake
|
||||
api_key: fake-key
|
||||
api_base: https://exampleopenaiendpoint-production.up.railway.app/
|
||||
- model_name: "*"
|
||||
litellm_params:
|
||||
model: openai/*
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
|
||||
|
||||
litellm_settings:
|
||||
success_callback: ["prometheus"]
|
||||
default_team_settings:
|
||||
- team_id: team-1
|
||||
success_callback: ["langfuse"]
|
||||
langfuse_public_key: os.environ/LANGFUSE_PROJECT1_PUBLIC # Project 1
|
||||
langfuse_secret: os.environ/LANGFUSE_PROJECT1_SECRET # Project 1
|
||||
- team_id: team-2
|
||||
success_callback: ["langfuse"]
|
||||
langfuse_public_key: os.environ/LANGFUSE_PROJECT2_PUBLIC # Project 2
|
||||
langfuse_secret: os.environ/LANGFUSE_PROJECT2_SECRET # Project 2
|
||||
|
||||
general_settings:
|
||||
store_model_in_db: true
|
||||
master_key: sk-1234
|
File diff suppressed because it is too large
|
@ -39,7 +39,6 @@ model LiteLLM_ProxyModelTable {
|
|||
updated_by String
|
||||
}
|
||||
|
||||
|
||||
model LiteLLM_OrganizationTable {
|
||||
organization_id String @id @default(uuid())
|
||||
organization_alias String
|
||||
|
@ -54,6 +53,7 @@ model LiteLLM_OrganizationTable {
|
|||
updated_by String
|
||||
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||
teams LiteLLM_TeamTable[]
|
||||
users LiteLLM_UserTable[]
|
||||
}
|
||||
|
||||
// Model info for teams, just has model aliases for now.
|
||||
|
@ -98,7 +98,9 @@ model LiteLLM_TeamTable {
|
|||
// Track spend, rate limit, budget Users
|
||||
model LiteLLM_UserTable {
|
||||
user_id String @id
|
||||
user_alias String?
|
||||
team_id String?
|
||||
organization_id String?
|
||||
teams String[] @default([])
|
||||
user_role String?
|
||||
max_budget Float?
|
||||
|
@ -113,6 +115,7 @@ model LiteLLM_UserTable {
|
|||
allowed_cache_controls String[] @default([])
|
||||
model_spend Json @default("{}")
|
||||
model_max_budget Json @default("{}")
|
||||
litellm_organization_table LiteLLM_OrganizationTable? @relation(fields: [organization_id], references: [organization_id])
|
||||
}
|
||||
|
||||
// Generate Tokens for Proxy
|
||||
|
|
126
litellm/proxy/tests/test_openai_embedding.py
Normal file
|
@ -0,0 +1,126 @@
|
|||
import openai
|
||||
import asyncio
|
||||
|
||||
|
||||
async def async_request(client, model, input_data):
|
||||
response = await client.embeddings.create(model=model, input=input_data)
|
||||
response = response.dict()
|
||||
data_list = response["data"]
|
||||
for i, embedding in enumerate(data_list):
|
||||
embedding["embedding"] = []
|
||||
current_index = embedding["index"]
|
||||
assert i == current_index
|
||||
return response
|
||||
|
||||
|
||||
async def main():
|
||||
client = openai.AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
|
||||
models = [
|
||||
"text-embedding-ada-002",
|
||||
"text-embedding-ada-002",
|
||||
"text-embedding-ada-002",
|
||||
]
|
||||
inputs = [
|
||||
[
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"19",
|
||||
"20",
|
||||
],
|
||||
["1", "2", "3", "4", "5", "6"],
|
||||
[
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"19",
|
||||
"20",
|
||||
],
|
||||
[
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"19",
|
||||
"20",
|
||||
],
|
||||
[
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"19",
|
||||
"20",
|
||||
],
|
||||
["1", "2", "3"],
|
||||
]
|
||||
|
||||
tasks = []
|
||||
for model, input_data in zip(models, inputs):
|
||||
task = async_request(client, model, input_data)
|
||||
tasks.append(task)
|
||||
|
||||
responses = await asyncio.gather(*tasks)
|
||||
print(responses)
|
||||
for response in responses:
|
||||
data_list = response["data"]
|
||||
for embedding in data_list:
|
||||
embedding["embedding"] = []
|
||||
print(response)
|
||||
|
||||
|
||||
asyncio.run(main())
|
10
litellm/proxy/tests/test_openai_simple_embedding.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
import openai
|
||||
|
||||
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.embeddings.create(
|
||||
model="text-embedding-ada-002", input=["test"], encoding_format="base64"
|
||||
)
|
||||
|
||||
print(response)
|
|
@ -182,6 +182,25 @@ class ProxyLogging:
|
|||
raise e
|
||||
return data
|
||||
|
||||
def _response_taking_too_long_callback(
|
||||
self,
|
||||
kwargs, # kwargs to completion
|
||||
start_time,
|
||||
end_time, # start/end time
|
||||
):
|
||||
try:
|
||||
time_difference = end_time - start_time
|
||||
# Convert the timedelta to float (in seconds)
|
||||
time_difference_float = time_difference.total_seconds()
|
||||
litellm_params = kwargs.get("litellm_params", {})
|
||||
api_base = litellm_params.get("api_base", "")
|
||||
model = kwargs.get("model", "")
|
||||
messages = kwargs.get("messages", "")
|
||||
|
||||
return time_difference_float, model, api_base, messages
|
||||
except Exception as e:
|
||||
raise e
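The helper above only extracts the elapsed time and request details; the alerting decision itself is a single threshold comparison, as in this minimal sketch (the values are illustrative):

from datetime import datetime, timedelta

def is_slow(start_time: datetime, end_time: datetime, alerting_threshold: float) -> bool:
    # Convert the timedelta to float seconds and compare against the configured threshold.
    return (end_time - start_time).total_seconds() > alerting_threshold

start = datetime(2024, 4, 1, 12, 0, 0)
print(is_slow(start, start + timedelta(seconds=450), alerting_threshold=300))  # True -> slow-response alert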
|
||||
|
||||
async def response_taking_too_long_callback(
|
||||
self,
|
||||
kwargs, # kwargs to completion
|
||||
|
@ -191,13 +210,13 @@ class ProxyLogging:
|
|||
):
|
||||
if self.alerting is None:
|
||||
return
|
||||
time_difference = end_time - start_time
|
||||
# Convert the timedelta to float (in seconds)
|
||||
time_difference_float = time_difference.total_seconds()
|
||||
litellm_params = kwargs.get("litellm_params", {})
|
||||
api_base = litellm_params.get("api_base", "")
|
||||
model = kwargs.get("model", "")
|
||||
messages = kwargs.get("messages", "")
|
||||
time_difference_float, model, api_base, messages = (
|
||||
self._response_taking_too_long_callback(
|
||||
kwargs=kwargs,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
)
|
||||
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
|
||||
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
|
||||
if time_difference_float > self.alerting_threshold:
|
||||
|
@ -244,6 +263,20 @@ class ProxyLogging:
|
|||
request_data is not None
|
||||
and request_data.get("litellm_status", "") != "success"
|
||||
):
|
||||
if request_data.get("deployment", None) is not None and isinstance(
|
||||
request_data["deployment"], dict
|
||||
):
|
||||
_api_base = litellm.get_api_base(
|
||||
model=model,
|
||||
optional_params=request_data["deployment"].get(
|
||||
"litellm_params", {}
|
||||
),
|
||||
)
|
||||
|
||||
if _api_base is None:
|
||||
_api_base = ""
|
||||
|
||||
request_info += f"\nAPI Base: {_api_base}"
|
||||
# only alert hanging responses if they have not been marked as success
|
||||
alerting_message = (
|
||||
f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
|
||||
|
@ -428,7 +461,12 @@ class ProxyLogging:
|
|||
"""
|
||||
### ALERTING ###
|
||||
if isinstance(original_exception, HTTPException):
|
||||
if isinstance(original_exception.detail, str):
|
||||
error_message = original_exception.detail
|
||||
elif isinstance(original_exception.detail, dict):
|
||||
error_message = json.dumps(original_exception.detail)
|
||||
else:
|
||||
error_message = str(original_exception)
|
||||
else:
|
||||
error_message = str(original_exception)
|
||||
if isinstance(traceback_str, str):
|
||||
|
@ -529,6 +567,7 @@ class PrismaClient:
|
|||
end_user_list_transactons: dict = {}
|
||||
key_list_transactons: dict = {}
|
||||
team_list_transactons: dict = {}
|
||||
org_list_transactons: dict = {}
|
||||
spend_log_transactions: List = []
|
||||
|
||||
def __init__(self, database_url: str, proxy_logging_obj: ProxyLogging):
|
||||
|
@ -1126,6 +1165,7 @@ class PrismaClient:
|
|||
return new_verification_token
|
||||
elif table_name == "user":
|
||||
db_data = self.jsonify_object(data=data)
|
||||
try:
|
||||
new_user_row = await self.db.litellm_usertable.upsert(
|
||||
where={"user_id": data["user_id"]},
|
||||
data={
|
||||
|
@ -1133,6 +1173,18 @@ class PrismaClient:
|
|||
"update": {}, # don't do anything if it already exists
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
if (
|
||||
"Foreign key constraint failed on the field: `LiteLLM_UserTable_organization_id_fkey (index)`"
|
||||
in str(e)
|
||||
):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail={
|
||||
"error": f"Foreign Key Constraint failed. Organization ID={db_data['organization_id']} does not exist in LiteLLM_OrganizationTable. Create via `/organization/new`."
|
||||
},
|
||||
)
|
||||
raise e
|
||||
verbose_proxy_logger.info("Data Inserted into User Table")
|
||||
return new_user_row
|
||||
elif table_name == "team":
|
||||
|
@ -2099,6 +2151,46 @@ async def update_spend(
|
|||
)
|
||||
raise e
|
||||
|
||||
### UPDATE ORG TABLE ###
|
||||
if len(prisma_client.org_list_transactons.keys()) > 0:
|
||||
for i in range(n_retry_times + 1):
|
||||
try:
|
||||
async with prisma_client.db.tx(
|
||||
timeout=timedelta(seconds=60)
|
||||
) as transaction:
|
||||
async with transaction.batch_() as batcher:
|
||||
for (
|
||||
org_id,
|
||||
response_cost,
|
||||
) in prisma_client.org_list_transactons.items():
|
||||
batcher.litellm_organizationtable.update_many( # 'update_many' prevents error from being raised if no row exists
|
||||
where={"organization_id": org_id},
|
||||
data={"spend": {"increment": response_cost}},
|
||||
)
|
||||
prisma_client.org_list_transactons = (
|
||||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update org spend: {str(e)}"
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e, traceback_str=error_traceback
|
||||
)
|
||||
)
|
||||
raise e
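The retry loop above is a standard exponential-backoff pattern: retry only on read timeouts, sleep 2**i seconds between attempts, and re-raise once the retry budget is exhausted. The same pattern in isolation, as a generic sketch rather than the proxy's code:

import asyncio
import httpx

async def with_backoff(op, n_retry_times: int = 3):
    # Retry `op` on read timeouts with exponential backoff; re-raise after the last attempt.
    for i in range(n_retry_times + 1):
        try:
            return await op()
        except httpx.ReadTimeout:
            if i >= n_retry_times:
                raise
            await asyncio.sleep(2**i)

# usage (inside an async context, with some httpx client): await with_backoff(lambda: client.get("https://example.com"))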
|
||||
|
||||
### UPDATE SPEND LOGS ###
|
||||
verbose_proxy_logger.debug(
|
||||
"Spend Logs transactions: {}".format(len(prisma_client.spend_log_transactions))
|
||||
|
@ -2182,32 +2274,6 @@ async def update_spend(
|
|||
raise e
|
||||
|
||||
|
||||
# class Models:
|
||||
# """
|
||||
# Need a class to maintain state of models / router across calls to check if new deployments need to be added
|
||||
# """
|
||||
|
||||
# def __init__(
|
||||
# self,
|
||||
# router: litellm.Router,
|
||||
# llm_model_list: list,
|
||||
# prisma_client: PrismaClient,
|
||||
# proxy_logging_obj: ProxyLogging,
|
||||
# master_key: str,
|
||||
# ) -> None:
|
||||
# self.router = router
|
||||
# self.llm_model_list = llm_model_list
|
||||
# self.prisma_client = prisma_client
|
||||
# self.proxy_logging_obj = proxy_logging_obj
|
||||
# self.master_key = master_key
|
||||
|
||||
# def get_router(self) -> litellm.Router:
|
||||
# return self.router
|
||||
|
||||
# def get_model_list(self) -> list:
|
||||
# return self.llm_model_list
|
||||
|
||||
|
||||
async def _read_request_body(request):
|
||||
"""
|
||||
Asynchronous function to read the request body and parse it as JSON or literal data.
|
||||
|
|
|
@ -11,9 +11,9 @@ import copy, httpx
|
|||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Union, Literal, Any, BinaryIO
|
||||
import random, threading, time, traceback, uuid
|
||||
import litellm, openai
|
||||
import litellm, openai, hashlib, json
|
||||
from litellm.caching import RedisCache, InMemoryCache, DualCache
|
||||
|
||||
import datetime as datetime_og
|
||||
import logging, asyncio
|
||||
import inspect, concurrent
|
||||
from openai import AsyncOpenAI
|
||||
|
@ -21,120 +21,16 @@ from collections import defaultdict
|
|||
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
|
||||
from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler
|
||||
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
|
||||
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2
|
||||
from litellm.llms.custom_httpx.azure_dall_e_2 import (
|
||||
CustomHTTPTransport,
|
||||
AsyncCustomHTTPTransport,
|
||||
)
|
||||
from litellm.utils import ModelResponse, CustomStreamWrapper
|
||||
from litellm.utils import ModelResponse, CustomStreamWrapper, get_utc_datetime
|
||||
import copy
|
||||
from litellm._logging import verbose_router_logger
|
||||
import logging
|
||||
from pydantic import BaseModel, validator
|
||||
|
||||
|
||||
class ModelInfo(BaseModel):
|
||||
id: Optional[
|
||||
str
|
||||
] # Allow id to be optional on input, but it will always be present as a str in the model instance
|
||||
|
||||
def __init__(self, id: Optional[Union[str, int]] = None, **params):
|
||||
if id is None:
|
||||
id = str(uuid.uuid4()) # Generate a UUID if id is None or not provided
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
super().__init__(id=id, **params)
|
||||
|
||||
class Config:
|
||||
extra = "allow"
|
||||
|
||||
def __contains__(self, key):
|
||||
# Define custom behavior for the 'in' operator
|
||||
return hasattr(self, key)
|
||||
|
||||
def get(self, key, default=None):
|
||||
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
|
||||
return getattr(self, key, default)
|
||||
|
||||
def __getitem__(self, key):
|
||||
# Allow dictionary-style access to attributes
|
||||
return getattr(self, key)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
# Allow dictionary-style assignment of attributes
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
class LiteLLM_Params(BaseModel):
|
||||
model: str
|
||||
tpm: Optional[int] = None
|
||||
rpm: Optional[int] = None
|
||||
api_key: Optional[str] = None
|
||||
api_base: Optional[str] = None
|
||||
api_version: Optional[str] = None
|
||||
timeout: Optional[Union[float, str]] = None # if str, pass in as os.environ/
|
||||
stream_timeout: Optional[Union[float, str]] = (
|
||||
None # timeout when making stream=True calls, if str, pass in as os.environ/
|
||||
)
|
||||
max_retries: int = 2 # follows openai default of 2
|
||||
organization: Optional[str] = None # for openai orgs
|
||||
|
||||
def __init__(self, max_retries: Optional[Union[int, str]] = None, **params):
|
||||
if max_retries is None:
|
||||
max_retries = 2
|
||||
elif isinstance(max_retries, str):
|
||||
max_retries = int(max_retries) # cast to int
|
||||
super().__init__(max_retries=max_retries, **params)
|
||||
|
||||
class Config:
|
||||
extra = "allow"
|
||||
|
||||
def __contains__(self, key):
|
||||
# Define custom behavior for the 'in' operator
|
||||
return hasattr(self, key)
|
||||
|
||||
def get(self, key, default=None):
|
||||
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
|
||||
return getattr(self, key, default)
|
||||
|
||||
def __getitem__(self, key):
|
||||
# Allow dictionary-style access to attributes
|
||||
return getattr(self, key)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
# Allow dictionary-style assignment of attributes
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
class Deployment(BaseModel):
|
||||
model_name: str
|
||||
litellm_params: LiteLLM_Params
|
||||
model_info: ModelInfo
|
||||
|
||||
def to_json(self, **kwargs):
|
||||
try:
|
||||
return self.model_dump(**kwargs) # noqa
|
||||
except Exception as e:
|
||||
# if using pydantic v1
|
||||
return self.dict(**kwargs)
|
||||
|
||||
class Config:
|
||||
extra = "allow"
|
||||
|
||||
def __contains__(self, key):
|
||||
# Define custom behavior for the 'in' operator
|
||||
return hasattr(self, key)
|
||||
|
||||
def get(self, key, default=None):
|
||||
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
|
||||
return getattr(self, key, default)
|
||||
|
||||
def __getitem__(self, key):
|
||||
# Allow dictionary-style access to attributes
|
||||
return getattr(self, key)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
# Allow dictionary-style assignment of attributes
|
||||
setattr(self, key, value)
|
||||
from litellm.types.router import Deployment, ModelInfo, LiteLLM_Params, RouterErrors
|
||||
|
||||
|
||||
class Router:
|
||||
|
@ -182,6 +78,7 @@ class Router:
|
|||
"latency-based-routing",
|
||||
] = "simple-shuffle",
|
||||
routing_strategy_args: dict = {}, # just for latency-based routing
|
||||
semaphore: Optional[asyncio.Semaphore] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
|
||||
|
@ -247,6 +144,8 @@ class Router:
|
|||
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
|
||||
```
|
||||
"""
|
||||
if semaphore:
|
||||
self.semaphore = semaphore
|
||||
self.set_verbose = set_verbose
|
||||
self.debug_level = debug_level
|
||||
self.enable_pre_call_checks = enable_pre_call_checks
|
||||
|
@ -378,6 +277,12 @@ class Router:
|
|||
)
|
||||
if isinstance(litellm.callbacks, list):
|
||||
litellm.callbacks.append(self.lowesttpm_logger) # type: ignore
|
||||
elif routing_strategy == "usage-based-routing-v2":
|
||||
self.lowesttpm_logger_v2 = LowestTPMLoggingHandler_v2(
|
||||
router_cache=self.cache, model_list=self.model_list
|
||||
)
|
||||
if isinstance(litellm.callbacks, list):
|
||||
litellm.callbacks.append(self.lowesttpm_logger_v2) # type: ignore
|
||||
elif routing_strategy == "latency-based-routing":
|
||||
self.lowestlatency_logger = LowestLatencyLoggingHandler(
|
||||
router_cache=self.cache,
|
||||
|
@ -507,19 +412,26 @@ class Router:
|
|||
raise e
|
||||
|
||||
async def _acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
|
||||
"""
|
||||
- Get an available deployment
|
||||
- call it with a semaphore over the call
|
||||
- semaphore specific to its rpm
|
||||
- in the semaphore, make a check against its local rpm before running
|
||||
"""
|
||||
model_name = None
|
||||
try:
|
||||
verbose_router_logger.debug(
|
||||
f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
|
||||
)
|
||||
deployment = self.get_available_deployment(
|
||||
|
||||
deployment = await self.async_get_available_deployment(
|
||||
model=model,
|
||||
messages=messages,
|
||||
specific_deployment=kwargs.pop("specific_deployment", None),
|
||||
)
|
||||
if self.set_verbose == True and self.debug_level == "DEBUG":
|
||||
|
||||
# debug how often this deployment picked
|
||||
self._print_deployment_metrics(deployment=deployment)
|
||||
self._track_deployment_metrics(deployment=deployment)
|
||||
|
||||
kwargs.setdefault("metadata", {}).update(
|
||||
{
|
||||
|
@ -541,6 +453,7 @@ class Router:
|
|||
potential_model_client = self._get_client(
|
||||
deployment=deployment, kwargs=kwargs, client_type="async"
|
||||
)
|
||||
|
||||
# check if provided keys == client keys #
|
||||
dynamic_api_key = kwargs.get("api_key", None)
|
||||
if (
|
||||
|
@ -563,7 +476,7 @@ class Router:
|
|||
) # this uses default_litellm_params when nothing is set
|
||||
)
|
||||
|
||||
response = await litellm.acompletion(
|
||||
_response = litellm.acompletion(
|
||||
**{
|
||||
**data,
|
||||
"messages": messages,
|
||||
|
@ -573,13 +486,32 @@ class Router:
|
|||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
rpm_semaphore = self._get_client(
|
||||
deployment=deployment, kwargs=kwargs, client_type="rpm_client"
|
||||
)
|
||||
|
||||
if (
|
||||
rpm_semaphore is not None
|
||||
and isinstance(rpm_semaphore, asyncio.Semaphore)
|
||||
and self.routing_strategy == "usage-based-routing-v2"
|
||||
):
|
||||
async with rpm_semaphore:
|
||||
"""
|
||||
- Check rpm limits before making the call
|
||||
"""
|
||||
await self.lowesttpm_logger_v2.pre_call_rpm_check(deployment)
|
||||
response = await _response
|
||||
else:
|
||||
response = await _response
|
||||
|
||||
self.success_calls[model_name] += 1
|
||||
verbose_router_logger.info(
|
||||
f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
|
||||
)
|
||||
if self.set_verbose == True and self.debug_level == "DEBUG":
|
||||
# debug how often this deployment picked
|
||||
self._print_deployment_metrics(deployment=deployment, response=response)
|
||||
self._track_deployment_metrics(deployment=deployment, response=response)
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
verbose_router_logger.info(
|
||||
|
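The semaphore gating in the hunk above (an `asyncio.Semaphore` sized to the deployment's rpm, created in `set_client`) boils down to: acquire the semaphore, run the per-minute rpm check, then await the pending call. A minimal sketch of that shape (illustrative only):

import asyncio

async def call_with_rpm_guard(make_call, rpm_semaphore, pre_call_rpm_check, deployment):
    # Only deployments configured with `rpm` get a semaphore; otherwise the call runs unguarded.
    if isinstance(rpm_semaphore, asyncio.Semaphore):
        async with rpm_semaphore:
            await pre_call_rpm_check(deployment)  # fail fast if this minute's rpm budget is spent
            return await make_call()
    return await make_call()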
@ -686,7 +618,7 @@ class Router:
|
|||
verbose_router_logger.debug(
|
||||
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
|
||||
)
|
||||
deployment = self.get_available_deployment(
|
||||
deployment = await self.async_get_available_deployment(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "prompt"}],
|
||||
specific_deployment=kwargs.pop("specific_deployment", None),
|
||||
|
@ -786,7 +718,7 @@ class Router:
|
|||
verbose_router_logger.debug(
|
||||
f"Inside _atranscription()- model: {model}; kwargs: {kwargs}"
|
||||
)
|
||||
deployment = self.get_available_deployment(
|
||||
deployment = await self.async_get_available_deployment(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "prompt"}],
|
||||
specific_deployment=kwargs.pop("specific_deployment", None),
|
||||
|
@ -866,7 +798,7 @@ class Router:
|
|||
verbose_router_logger.debug(
|
||||
f"Inside _moderation()- model: {model}; kwargs: {kwargs}"
|
||||
)
|
||||
deployment = self.get_available_deployment(
|
||||
deployment = await self.async_get_available_deployment(
|
||||
model=model,
|
||||
input=input,
|
||||
specific_deployment=kwargs.pop("specific_deployment", None),
|
||||
|
@ -1009,7 +941,7 @@ class Router:
|
|||
verbose_router_logger.debug(
|
||||
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
|
||||
)
|
||||
deployment = self.get_available_deployment(
|
||||
deployment = await self.async_get_available_deployment(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
specific_deployment=kwargs.pop("specific_deployment", None),
|
||||
|
@ -1171,11 +1103,12 @@ class Router:
|
|||
raise e
|
||||
|
||||
async def _aembedding(self, input: Union[str, List], model: str, **kwargs):
|
||||
model_name = None
|
||||
try:
|
||||
verbose_router_logger.debug(
|
||||
f"Inside _aembedding()- model: {model}; kwargs: {kwargs}"
|
||||
)
|
||||
deployment = self.get_available_deployment(
|
||||
deployment = await self.async_get_available_deployment(
|
||||
model=model,
|
||||
input=input,
|
||||
specific_deployment=kwargs.pop("specific_deployment", None),
|
||||
|
@ -1363,6 +1296,8 @@ class Router:
|
|||
min_timeout=self.retry_after,
|
||||
)
|
||||
await asyncio.sleep(timeout)
|
||||
elif RouterErrors.user_defined_ratelimit_error.value in str(e):
|
||||
raise e # don't wait to retry if deployment hits user-defined rate-limit
|
||||
elif hasattr(original_exception, "status_code") and litellm._should_retry(
|
||||
status_code=original_exception.status_code
|
||||
):
|
||||
|
@ -1678,6 +1613,24 @@ class Router:
|
|||
except Exception as e:
|
||||
raise e
|
||||
|
||||
def _update_usage(self, deployment_id: str):
|
||||
"""
|
||||
Update deployment rpm for that minute
|
||||
"""
|
||||
rpm_key = deployment_id
|
||||
|
||||
request_count = self.cache.get_cache(key=rpm_key, local_only=True)
|
||||
if request_count is None:
|
||||
request_count = 1
|
||||
self.cache.set_cache(
|
||||
key=rpm_key, value=request_count, local_only=True, ttl=60
|
||||
) # only store for 60s
|
||||
else:
|
||||
request_count += 1
|
||||
self.cache.set_cache(
|
||||
key=rpm_key, value=request_count, local_only=True
|
||||
) # don't change existing ttl
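`_update_usage` above is a per-minute counter: the first request in a window writes the key with a 60-second TTL, later requests only increment it without extending the TTL. The same idea against a plain in-process dict, as a standalone sketch:

import time

_counts: dict = {}  # deployment_id -> (count, window_expiry)

def incr_rpm(deployment_id: str, ttl: int = 60) -> int:
    # First hit of a window sets the expiry; later hits increment without extending it.
    now = time.time()
    count, expires_at = _counts.get(deployment_id, (0, now + ttl))
    if now >= expires_at:
        count, expires_at = 0, now + ttl
    _counts[deployment_id] = (count + 1, expires_at)
    return count + 1

print(incr_rpm("deployment-abc"))  # 1
print(incr_rpm("deployment-abc"))  # 2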
|
||||
|
||||
def _set_cooldown_deployments(self, deployment: Optional[str] = None):
|
||||
"""
|
||||
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
|
||||
|
@ -1685,7 +1638,8 @@ class Router:
|
|||
if deployment is None:
|
||||
return
|
||||
|
||||
current_minute = datetime.now().strftime("%H-%M")
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
# get current fails for deployment
|
||||
# update the number of failed calls
|
||||
# if it's > allowed fails
|
||||
|
@ -1723,11 +1677,29 @@ class Router:
|
|||
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||
)
|
||||
|
||||
async def _async_get_cooldown_deployments(self):
|
||||
"""
|
||||
Async implementation of '_get_cooldown_deployments'
|
||||
"""
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
# get the current cooldown list for that minute
|
||||
cooldown_key = f"{current_minute}:cooldown_models"
|
||||
|
||||
# ----------------------
|
||||
# Return cooldown models
|
||||
# ----------------------
|
||||
cooldown_models = await self.cache.async_get_cache(key=cooldown_key) or []
|
||||
|
||||
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
|
||||
return cooldown_models
|
||||
|
||||
def _get_cooldown_deployments(self):
|
||||
"""
|
||||
Get the list of models being cooled down for this minute
|
||||
"""
|
||||
current_minute = datetime.now().strftime("%H-%M")
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
# get the current cooldown list for that minute
|
||||
cooldown_key = f"{current_minute}:cooldown_models"
|
||||
|
||||
|
@ -1741,12 +1713,26 @@ class Router:
|
|||
|
||||
def set_client(self, model: dict):
|
||||
"""
|
||||
Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
|
||||
- Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
|
||||
- Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
|
||||
"""
|
||||
client_ttl = self.client_ttl
|
||||
litellm_params = model.get("litellm_params", {})
|
||||
model_name = litellm_params.get("model")
|
||||
model_id = model["model_info"]["id"]
|
||||
# ### IF RPM SET - initialize a semaphore ###
|
||||
rpm = litellm_params.get("rpm", None)
|
||||
if rpm:
|
||||
semaphore = asyncio.Semaphore(rpm)
|
||||
cache_key = f"{model_id}_rpm_client"
|
||||
self.cache.set_cache(
|
||||
key=cache_key,
|
||||
value=semaphore,
|
||||
local_only=True,
|
||||
)
|
||||
|
||||
# print("STORES SEMAPHORE IN CACHE")
|
||||
|
||||
#### for OpenAI / Azure we need to initialize the Client for High Traffic ########
|
||||
custom_llm_provider = litellm_params.get("custom_llm_provider")
|
||||
custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
|
||||
|
@ -1762,11 +1748,18 @@ class Router:
|
|||
model_name in litellm.open_ai_chat_completion_models
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
or custom_llm_provider == "azure"
|
||||
or custom_llm_provider == "azure_text"
|
||||
or custom_llm_provider == "custom_openai"
|
||||
or custom_llm_provider == "openai"
|
||||
or custom_llm_provider == "text-completion-openai"
|
||||
or "ft:gpt-3.5-turbo" in model_name
|
||||
or model_name in litellm.open_ai_embedding_models
|
||||
):
|
||||
if custom_llm_provider == "azure":
|
||||
if litellm.utils._is_non_openai_azure_model(model_name):
|
||||
custom_llm_provider = "openai"
|
||||
# remove azure prefix from model_name
|
||||
model_name = model_name.replace("azure/", "")
|
||||
# glorified / complicated reading of configs
|
||||
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
|
||||
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
|
||||
|
@ -1954,8 +1947,12 @@ class Router:
|
|||
local_only=True,
|
||||
) # cache for 1 hr
|
||||
else:
|
||||
_api_key = api_key
|
||||
if _api_key is not None and isinstance(_api_key, str):
|
||||
# only show first 8 chars of api_key
|
||||
_api_key = _api_key[:8] + "*" * 15
|
||||
verbose_router_logger.debug(
|
||||
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{api_key}"
|
||||
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
|
||||
)
|
||||
azure_client_params = {
|
||||
"api_key": api_key,
|
||||
|
@ -2052,8 +2049,12 @@ class Router:
|
|||
) # cache for 1 hr
|
||||
|
||||
else:
|
||||
_api_key = api_key
|
||||
if _api_key is not None and isinstance(_api_key, str):
|
||||
# only show first 8 chars of api_key
|
||||
_api_key = _api_key[:8] + "*" * 15
|
||||
verbose_router_logger.debug(
|
||||
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{api_key}"
|
||||
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
|
||||
)
|
||||
cache_key = f"{model_id}_async_client"
|
||||
_client = openai.AsyncOpenAI( # type: ignore
|
||||
|
@ -2145,6 +2146,34 @@ class Router:
|
|||
local_only=True,
|
||||
) # cache for 1 hr
|
||||
|
||||
def _generate_model_id(self, model_group: str, litellm_params: dict):
|
||||
"""
|
||||
Helper function to consistently generate the same id for a deployment
|
||||
|
||||
- create a string from all the litellm params
|
||||
- hash
|
||||
- use hash as id
|
||||
"""
|
||||
concat_str = model_group
|
||||
for k, v in litellm_params.items():
|
||||
if isinstance(k, str):
|
||||
concat_str += k
|
||||
elif isinstance(k, dict):
|
||||
concat_str += json.dumps(k)
|
||||
else:
|
||||
concat_str += str(k)
|
||||
|
||||
if isinstance(v, str):
|
||||
concat_str += v
|
||||
elif isinstance(v, dict):
|
||||
concat_str += json.dumps(v)
|
||||
else:
|
||||
concat_str += str(v)
|
||||
|
||||
hash_object = hashlib.sha256(concat_str.encode())
|
||||
|
||||
return hash_object.hexdigest()
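Because the id is a sha256 over the model group plus its litellm_params, an identical deployment definition always hashes to the same id. A standalone illustration (simplified: it serializes the params with sorted keys, whereas the helper above concatenates them in iteration order):

import hashlib
import json

def model_id(model_group: str, litellm_params: dict) -> str:
    # Deterministic: identical params -> identical id.
    concat = model_group + json.dumps(litellm_params, sort_keys=True)
    return hashlib.sha256(concat.encode()).hexdigest()

params = {"model": "azure/chatgpt-v-2", "api_version": "2023-07-01-preview"}
assert model_id("azure-gpt-3.5", params) == model_id("azure-gpt-3.5", dict(params))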
|
||||
|
||||
def set_model_list(self, model_list: list):
|
||||
original_model_list = copy.deepcopy(model_list)
|
||||
self.model_list = []
|
||||
|
@ -2160,7 +2189,13 @@ class Router:
|
|||
if isinstance(v, str) and v.startswith("os.environ/"):
|
||||
_litellm_params[k] = litellm.get_secret(v)
|
||||
|
||||
_model_info = model.pop("model_info", {})
|
||||
_model_info: dict = model.pop("model_info", {})
|
||||
|
||||
# check if model info has id
|
||||
if "id" not in _model_info:
|
||||
_id = self._generate_model_id(_model_name, _litellm_params)
|
||||
_model_info["id"] = _id
|
||||
|
||||
deployment = Deployment(
|
||||
**model,
|
||||
model_name=_model_name,
|
||||
|
@ -2253,6 +2288,36 @@ class Router:
|
|||
self.model_names.append(deployment.model_name)
|
||||
return
|
||||
|
||||
def delete_deployment(self, id: str) -> Optional[Deployment]:
|
||||
"""
|
||||
Parameters:
|
||||
- id: str - the id of the deployment to be deleted
|
||||
|
||||
Returns:
|
||||
- The deleted deployment
|
||||
- OR None (if deleted deployment not found)
|
||||
"""
|
||||
deployment_idx = None
|
||||
for idx, m in enumerate(self.model_list):
|
||||
if m["model_info"]["id"] == id:
|
||||
deployment_idx = idx
|
||||
|
||||
try:
|
||||
if deployment_idx is not None:
|
||||
item = self.model_list.pop(deployment_idx)
|
||||
return item
|
||||
else:
|
||||
return None
|
||||
except:
|
||||
return None
|
||||
|
||||
def get_deployment(self, model_id: str):
|
||||
for model in self.model_list:
|
||||
if "model_info" in model and "id" in model["model_info"]:
|
||||
if model_id == model["model_info"]["id"]:
|
||||
return model
|
||||
return None
|
||||
|
||||
def get_model_ids(self):
|
||||
ids = []
|
||||
for model in self.model_list:
|
||||
|
@ -2265,7 +2330,9 @@ class Router:
|
|||
return self.model_names
|
||||
|
||||
def get_model_list(self):
|
||||
if hasattr(self, "model_list"):
|
||||
return self.model_list
|
||||
return None
|
||||
|
||||
def _get_client(self, deployment, kwargs, client_type=None):
|
||||
"""
|
||||
|
@ -2280,7 +2347,11 @@ class Router:
|
|||
The appropriate client based on the given client_type and kwargs.
|
||||
"""
|
||||
model_id = deployment["model_info"]["id"]
|
||||
if client_type == "async":
|
||||
if client_type == "rpm_client":
|
||||
cache_key = "{}_rpm_client".format(model_id)
|
||||
client = self.cache.get_cache(key=cache_key, local_only=True)
|
||||
return client
|
||||
elif client_type == "async":
|
||||
if kwargs.get("stream") == True:
|
||||
cache_key = f"{model_id}_stream_async_client"
|
||||
client = self.cache.get_cache(key=cache_key, local_only=True)
|
||||
|
@ -2333,6 +2404,7 @@ class Router:
|
|||
Filter out model in model group, if:
|
||||
|
||||
- model context window < message length
|
||||
- filter models above rpm limits
|
||||
- [TODO] function call and model doesn't support function calling
|
||||
"""
|
||||
verbose_router_logger.debug(
|
||||
|
@ -2348,6 +2420,16 @@ class Router:
|
|||
except Exception as e:
|
||||
return _returned_deployments
|
||||
|
||||
_context_window_error = False
|
||||
_rate_limit_error = False
|
||||
|
||||
## get model group RPM ##
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
rpm_key = f"{model}:rpm:{current_minute}"
|
||||
model_group_cache = (
|
||||
self.cache.get_cache(key=rpm_key, local_only=True) or {}
|
||||
) # check the in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
|
||||
for idx, deployment in enumerate(_returned_deployments):
|
||||
# see if we have the info for this model
|
||||
try:
|
||||
|
@ -2360,8 +2442,6 @@ class Router:
|
|||
"model", None
|
||||
)
|
||||
model_info = litellm.get_model_info(model=model)
|
||||
except:
|
||||
continue
|
||||
|
||||
if (
|
||||
isinstance(model_info, dict)
|
||||
|
@ -2372,17 +2452,57 @@ class Router:
|
|||
and input_tokens > model_info["max_input_tokens"]
|
||||
):
|
||||
invalid_model_indices.append(idx)
|
||||
_context_window_error = True
|
||||
continue
|
||||
except Exception as e:
|
||||
verbose_router_logger.debug("An error occurs - {}".format(str(e)))
|
||||
|
||||
## RPM CHECK ##
|
||||
_litellm_params = deployment.get("litellm_params", {})
|
||||
model_id = deployment.get("model_info", {}).get("id", "")
|
||||
### get local router cache ###
|
||||
current_request_cache_local = (
|
||||
self.cache.get_cache(key=model_id, local_only=True) or 0
|
||||
)
|
||||
### get usage based cache ###
|
||||
if isinstance(model_group_cache, dict):
|
||||
model_group_cache[model_id] = model_group_cache.get(model_id, 0)
|
||||
|
||||
current_request = max(
|
||||
current_request_cache_local, model_group_cache[model_id]
|
||||
)
|
||||
|
||||
if (
|
||||
isinstance(_litellm_params, dict)
|
||||
and _litellm_params.get("rpm", None) is not None
|
||||
):
|
||||
if (
|
||||
isinstance(_litellm_params["rpm"], int)
|
||||
and _litellm_params["rpm"] <= current_request
|
||||
):
|
||||
invalid_model_indices.append(idx)
|
||||
_rate_limit_error = True
|
||||
continue
|
||||
|
||||
if len(invalid_model_indices) == len(_returned_deployments):
|
||||
"""
|
||||
- no healthy deployments available b/c context window checks
|
||||
- no healthy deployments available b/c context window checks or rate limit error
|
||||
|
||||
- First check for rate limit errors (if this is true, it means the model passed the context window check but failed the rate limit check)
|
||||
"""
|
||||
|
||||
if _rate_limit_error == True: # allow generic fallback logic to take place
|
||||
raise ValueError(
|
||||
f"No deployments available for selected model, passed model={model}"
|
||||
)
|
||||
elif _context_window_error == True:
|
||||
raise litellm.ContextWindowExceededError(
|
||||
message="Context Window exceeded for given call",
|
||||
model=model,
|
||||
llm_provider="",
|
||||
response=httpx.Response(
|
||||
status_code=400, request=httpx.Request("GET", "https://example.com")
|
||||
status_code=400,
|
||||
request=httpx.Request("GET", "https://example.com"),
|
||||
),
|
||||
)
|
||||
if len(invalid_model_indices) > 0:
|
||||
|
@ -2391,7 +2511,7 @@ class Router:
|
|||
|
||||
return _returned_deployments
|
||||
|
||||
def get_available_deployment(
|
||||
def _common_checks_available_deployment(
|
||||
self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
|
@ -2399,11 +2519,11 @@ class Router:
|
|||
specific_deployment: Optional[bool] = False,
|
||||
):
|
||||
"""
|
||||
Returns the deployment based on routing strategy
|
||||
"""
|
||||
Common checks for 'get_available_deployment' across sync + async call.
|
||||
|
||||
# users need to explicitly call a specific deployment, by setting `specific_deployment = True` as completion()/embedding() kwarg
|
||||
# When this was not explicit, we had several issues with fallbacks timing out
|
||||
If 'healthy_deployments' returned is None, this means the user chose a specific deployment
|
||||
"""
|
||||
# check if aliases set on litellm model alias map
|
||||
if specific_deployment == True:
|
||||
# users can also specify a specific deployment name. At this point we should check if they are just trying to call a specific deployment
|
||||
for deployment in self.model_list:
|
||||
|
@ -2411,12 +2531,11 @@ class Router:
|
|||
if deployment_model == model:
|
||||
# User Passed a specific deployment name on their config.yaml, example azure/chat-gpt-v-2
|
||||
# return the first deployment where the `model` matches the specified deployment name
|
||||
return deployment
|
||||
return deployment, None
|
||||
raise ValueError(
|
||||
f"LiteLLM Router: Trying to call specific deployment, but Model:{model} does not exist in Model List: {self.model_list}"
|
||||
)
|
||||
|
||||
# check if aliases set on litellm model alias map
|
||||
if model in self.model_group_alias:
|
||||
verbose_router_logger.debug(
|
||||
f"Using a model alias. Got Request for {model}, sending requests to {self.model_group_alias.get(model)}"
|
||||
|
@ -2428,7 +2547,7 @@ class Router:
|
|||
self.default_deployment
|
||||
) # self.default_deployment
|
||||
updated_deployment["litellm_params"]["model"] = model
|
||||
return updated_deployment
|
||||
return updated_deployment, None
|
||||
|
||||
## get healthy deployments
|
||||
### get all deployments
|
||||
|
@ -2443,6 +2562,118 @@ class Router:
|
|||
f"initial list of deployments: {healthy_deployments}"
|
||||
)
|
||||
|
||||
verbose_router_logger.debug(
|
||||
f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
|
||||
)
|
||||
if len(healthy_deployments) == 0:
|
||||
raise ValueError(f"No healthy deployment available, passed model={model}")
|
||||
if litellm.model_alias_map and model in litellm.model_alias_map:
|
||||
model = litellm.model_alias_map[
|
||||
model
|
||||
] # update the model to the actual value if an alias has been passed in
|
||||
|
||||
return model, healthy_deployments
|
||||
|
||||
async def async_get_available_deployment(
|
||||
self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
specific_deployment: Optional[bool] = False,
|
||||
):
|
||||
"""
|
||||
Async implementation of 'get_available_deployments'.
|
||||
|
||||
Allows all cache calls to be made async => 10x perf impact (8rps -> 100 rps).
|
||||
"""
|
||||
if (
|
||||
self.routing_strategy != "usage-based-routing-v2"
|
||||
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
|
||||
return self.get_available_deployment(
|
||||
model=model,
|
||||
messages=messages,
|
||||
input=input,
|
||||
specific_deployment=specific_deployment,
|
||||
)
|
||||
|
||||
model, healthy_deployments = self._common_checks_available_deployment(
|
||||
model=model,
|
||||
messages=messages,
|
||||
input=input,
|
||||
specific_deployment=specific_deployment,
|
||||
)
|
||||
|
||||
if healthy_deployments is None:
|
||||
return model
|
||||
|
||||
# filter out the deployments currently cooling down
|
||||
deployments_to_remove = []
|
||||
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
|
||||
cooldown_deployments = await self._async_get_cooldown_deployments()
|
||||
verbose_router_logger.debug(
|
||||
f"async cooldown deployments: {cooldown_deployments}"
|
||||
)
|
||||
# Find deployments in model_list whose model_id is cooling down
|
||||
for deployment in healthy_deployments:
|
||||
deployment_id = deployment["model_info"]["id"]
|
||||
if deployment_id in cooldown_deployments:
|
||||
deployments_to_remove.append(deployment)
|
||||
# remove unhealthy deployments from healthy deployments
|
||||
for deployment in deployments_to_remove:
|
||||
healthy_deployments.remove(deployment)
|
||||
|
||||
# filter pre-call checks
|
||||
if self.enable_pre_call_checks and messages is not None:
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model, healthy_deployments=healthy_deployments, messages=messages
|
||||
)
|
||||
|
||||
if (
|
||||
self.routing_strategy == "usage-based-routing-v2"
|
||||
and self.lowesttpm_logger_v2 is not None
|
||||
):
|
||||
deployment = await self.lowesttpm_logger_v2.async_get_available_deployments(
|
||||
model_group=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
|
||||
if deployment is None:
|
||||
verbose_router_logger.info(
|
||||
f"get_available_deployment for model: {model}, No deployment available"
|
||||
)
|
||||
raise ValueError(
|
||||
f"No deployments available for selected model, passed model={model}"
|
||||
)
|
||||
verbose_router_logger.info(
|
||||
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
|
||||
)
|
||||
return deployment
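A hedged usage sketch for the async path above: constructing a Router with the new "usage-based-routing-v2" strategy so that acompletion() goes through async_get_available_deployment. The rpm/tpm values are illustrative; Redis settings are optional and only needed for cross-instance accuracy.

import asyncio
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 600, "tpm": 60000},
        }
    ],
    routing_strategy="usage-based-routing-v2",
)

async def main():
    # the async call path picks a deployment via the lowest TPM/RPM v2 logger
    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
    )
    print(response.choices[0].message.content)

# asyncio.run(main())  # requires OPENAI_API_KEY in the environment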
|
||||
|
||||
def get_available_deployment(
|
||||
self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
specific_deployment: Optional[bool] = False,
|
||||
):
|
||||
"""
|
||||
Returns the deployment based on routing strategy
|
||||
"""
|
||||
# users need to explicitly call a specific deployment, by setting `specific_deployment = True` as completion()/embedding() kwarg
|
||||
# When this was not explicit, we had several issues with fallbacks timing out
|
||||
|
||||
model, healthy_deployments = self._common_checks_available_deployment(
|
||||
model=model,
|
||||
messages=messages,
|
||||
input=input,
|
||||
specific_deployment=specific_deployment,
|
||||
)
|
||||
|
||||
if healthy_deployments is None:
|
||||
return model
|
||||
|
||||
# filter out the deployments currently cooling down
|
||||
deployments_to_remove = []
|
||||
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
|
||||
|
@ -2463,16 +2694,6 @@ class Router:
|
|||
model=model, healthy_deployments=healthy_deployments, messages=messages
|
||||
)
|
||||
|
||||
verbose_router_logger.debug(
|
||||
f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
|
||||
)
|
||||
if len(healthy_deployments) == 0:
|
||||
raise ValueError(f"No healthy deployment available, passed model={model}")
|
||||
if litellm.model_alias_map and model in litellm.model_alias_map:
|
||||
model = litellm.model_alias_map[
|
||||
model
|
||||
] # update the model to the actual value if an alias has been passed in
|
||||
|
||||
if self.routing_strategy == "least-busy" and self.leastbusy_logger is not None:
|
||||
deployment = self.leastbusy_logger.get_available_deployments(
|
||||
model_group=model, healthy_deployments=healthy_deployments
|
||||
|
@ -2534,7 +2755,16 @@ class Router:
|
|||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
|
||||
elif (
|
||||
self.routing_strategy == "usage-based-routing-v2"
|
||||
and self.lowesttpm_logger_v2 is not None
|
||||
):
|
||||
deployment = self.lowesttpm_logger_v2.get_available_deployments(
|
||||
model_group=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
if deployment is None:
|
||||
verbose_router_logger.info(
|
||||
f"get_available_deployment for model: {model}, No deployment available"
|
||||
|
@ -2547,7 +2777,7 @@ class Router:
|
|||
)
|
||||
return deployment
|
||||
|
||||
def _print_deployment_metrics(self, deployment, response=None):
|
||||
def _track_deployment_metrics(self, deployment, response=None):
|
||||
try:
|
||||
litellm_params = deployment["litellm_params"]
|
||||
api_base = litellm_params.get("api_base", "")
|
||||
|
@ -2558,6 +2788,7 @@ class Router:
|
|||
|
||||
# update self.deployment_stats
|
||||
if model_id is not None:
|
||||
self._update_usage(model_id) # update in-memory cache for tracking
|
||||
if model_id in self.deployment_stats:
|
||||
# only update num_requests
|
||||
self.deployment_stats[model_id]["num_requests"] += 1
|
||||
|
@ -2569,7 +2800,10 @@ class Router:
|
|||
}
|
||||
else:
|
||||
# check response_ms and update num_successes
|
||||
if isinstance(response, dict):
|
||||
response_ms = response.get("_response_ms", 0)
|
||||
else:
|
||||
response_ms = 0
|
||||
if model_id is not None:
|
||||
if model_id in self.deployment_stats:
|
||||
# check if avg_latency exists
|
||||
|
@ -2594,15 +2828,18 @@ class Router:
|
|||
"num_successes": 1,
|
||||
"avg_latency": response_ms,
|
||||
}
|
||||
if self.set_verbose == True and self.debug_level == "DEBUG":
|
||||
from pprint import pformat
|
||||
|
||||
# Assuming self.deployment_stats is your dictionary
|
||||
formatted_stats = pformat(self.deployment_stats)
|
||||
|
||||
# Assuming verbose_router_logger is your logger
|
||||
verbose_router_logger.info("self.deployment_stats: \n%s", formatted_stats)
|
||||
verbose_router_logger.info(
|
||||
"self.deployment_stats: \n%s", formatted_stats
|
||||
)
|
||||
except Exception as e:
|
||||
verbose_router_logger.error(f"Error in _print_deployment_metrics: {str(e)}")
|
||||
verbose_router_logger.error(f"Error in _track_deployment_metrics: {str(e)}")
|
||||
|
||||
def flush_cache(self):
|
||||
litellm.cache = None
|
||||
|
|
|
@ -298,6 +298,4 @@ class LowestLatencyLoggingHandler(CustomLogger):
|
|||
elif item_latency < lowest_latency:
|
||||
lowest_latency = item_latency
|
||||
deployment = _deployment
|
||||
if deployment is None:
|
||||
deployment = random.choice(healthy_deployments)
|
||||
return deployment
|
||||
|
|
403
litellm/router_strategy/lowest_tpm_rpm_v2.py
Normal file
|
@ -0,0 +1,403 @@
|
|||
#### What this does ####
|
||||
# identifies lowest tpm deployment
|
||||
|
||||
import dotenv, os, requests, random
|
||||
from typing import Optional, Union, List, Dict
|
||||
import datetime as datetime_og
|
||||
from datetime import datetime
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback, asyncio, httpx
|
||||
import litellm
|
||||
from litellm import token_counter
|
||||
from litellm.caching import DualCache
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm._logging import verbose_router_logger
|
||||
from litellm.utils import print_verbose, get_utc_datetime
|
||||
from litellm.types.router import RouterErrors
|
||||
|
||||
|
||||
class LowestTPMLoggingHandler_v2(CustomLogger):
|
||||
"""
|
||||
Updated version of TPM/RPM Logging.
|
||||
|
||||
Meant to work across instances.
|
||||
|
||||
Caches individual models, not model_groups
|
||||
|
||||
Uses batch get (redis.mget)
|
||||
|
||||
Increments tpm/rpm limit using redis.incr
|
||||
"""
|
||||
|
||||
test_flag: bool = False
|
||||
logged_success: int = 0
|
||||
logged_failure: int = 0
|
||||
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
|
||||
|
||||
def __init__(self, router_cache: DualCache, model_list: list):
|
||||
self.router_cache = router_cache
|
||||
self.model_list = model_list
|
||||
|
||||
async def pre_call_rpm_check(self, deployment: dict) -> dict:
|
||||
"""
|
||||
Pre-call check + update model rpm
|
||||
- Used inside semaphore
|
||||
- raise rate limit error if deployment over limit
|
||||
|
||||
Why? solves concurrency issue - https://github.com/BerriAI/litellm/issues/2994
|
||||
|
||||
Returns - deployment
|
||||
|
||||
Raises - RateLimitError if deployment over defined RPM limit
|
||||
"""
|
||||
try:
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
model_group = deployment.get("model_name", "")
|
||||
rpm_key = f"{model_group}:rpm:{current_minute}"
|
||||
local_result = await self.router_cache.async_get_cache(
|
||||
key=rpm_key, local_only=True
|
||||
) # check local result first
|
||||
|
||||
deployment_rpm = None
|
||||
if deployment_rpm is None:
|
||||
deployment_rpm = deployment.get("rpm")
|
||||
if deployment_rpm is None:
|
||||
deployment_rpm = deployment.get("litellm_params", {}).get("rpm")
|
||||
if deployment_rpm is None:
|
||||
deployment_rpm = deployment.get("model_info", {}).get("rpm")
|
||||
if deployment_rpm is None:
|
||||
deployment_rpm = float("inf")
|
||||
|
||||
if local_result is not None and local_result >= deployment_rpm:
|
||||
raise litellm.RateLimitError(
|
||||
message="Deployment over defined rpm limit={}. current usage={}".format(
|
||||
deployment_rpm, local_result
|
||||
),
|
||||
llm_provider="",
|
||||
model=deployment.get("litellm_params", {}).get("model"),
|
||||
response=httpx.Response(
|
||||
status_code=429,
|
||||
content="{} rpm limit={}. current usage={}".format(
|
||||
RouterErrors.user_defined_ratelimit_error.value,
|
||||
deployment_rpm,
|
||||
local_result,
|
||||
),
|
||||
request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||
),
|
||||
)
|
||||
else:
|
||||
# if local result below limit, check redis ## prevent unnecessary redis checks
|
||||
result = await self.router_cache.async_increment_cache(
|
||||
key=rpm_key, value=1
|
||||
)
|
||||
if result is not None and result > deployment_rpm:
|
||||
raise litellm.RateLimitError(
|
||||
message="Deployment over defined rpm limit={}. current usage={}".format(
|
||||
deployment_rpm, result
|
||||
),
|
||||
llm_provider="",
|
||||
model=deployment.get("litellm_params", {}).get("model"),
|
||||
response=httpx.Response(
|
||||
status_code=429,
|
||||
content="{} rpm limit={}. current usage={}".format(
|
||||
RouterErrors.user_defined_ratelimit_error.value,
|
||||
deployment_rpm,
|
||||
result,
|
||||
),
|
||||
request=httpx.Request(method="tpm_rpm_limits", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||
),
|
||||
)
|
||||
return deployment
|
||||
except Exception as e:
|
||||
if isinstance(e, litellm.RateLimitError):
|
||||
raise e
|
||||
return deployment # don't fail calls if e.g. redis fails to connect
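A small sketch of the pre-call check above, using only the constructor and the deployment shape shown in this file (the id and rpm values are illustrative). With rpm=1, a second call inside the same minute is expected to trip the limit once the in-memory counter reflects the first increment.

import asyncio
from litellm.caching import DualCache
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2

handler = LowestTPMLoggingHandler_v2(router_cache=DualCache(), model_list=[])

deployment = {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {"model": "azure/chatgpt-v-2", "rpm": 1},
    "model_info": {"id": "1234"},
}

async def demo():
    await handler.pre_call_rpm_check(deployment)  # first request in this minute
    await handler.pre_call_rpm_check(deployment)  # expected to raise litellm.RateLimitError

# asyncio.run(demo())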
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
"""
|
||||
Update TPM/RPM usage on success
|
||||
"""
|
||||
if kwargs["litellm_params"].get("metadata") is None:
|
||||
pass
|
||||
else:
|
||||
model_group = kwargs["litellm_params"]["metadata"].get(
|
||||
"model_group", None
|
||||
)
|
||||
|
||||
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
|
||||
if model_group is None or id is None:
|
||||
return
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
|
||||
total_tokens = response_obj["usage"]["total_tokens"]
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
tpm_key = f"{model_group}:tpm:{current_minute}"
|
||||
rpm_key = f"{model_group}:rpm:{current_minute}"
|
||||
|
||||
# ------------
|
||||
# Update usage
|
||||
# ------------
|
||||
|
||||
## TPM
|
||||
request_count_dict = self.router_cache.get_cache(key=tpm_key) or {}
|
||||
request_count_dict[id] = request_count_dict.get(id, 0) + total_tokens
|
||||
|
||||
self.router_cache.set_cache(key=tpm_key, value=request_count_dict)
|
||||
|
||||
## RPM
|
||||
request_count_dict = self.router_cache.get_cache(key=rpm_key) or {}
|
||||
request_count_dict[id] = request_count_dict.get(id, 0) + 1
|
||||
|
||||
self.router_cache.set_cache(key=rpm_key, value=request_count_dict)
|
||||
|
||||
### TESTING ###
|
||||
if self.test_flag:
|
||||
self.logged_success += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
"""
|
||||
Update TPM usage on success
|
||||
"""
|
||||
if kwargs["litellm_params"].get("metadata") is None:
|
||||
pass
|
||||
else:
|
||||
model_group = kwargs["litellm_params"]["metadata"].get(
|
||||
"model_group", None
|
||||
)
|
||||
|
||||
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
|
||||
if model_group is None or id is None:
|
||||
return
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
|
||||
total_tokens = response_obj["usage"]["total_tokens"]
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime(
|
||||
"%H-%M"
|
||||
) # use the same timezone regardless of system clock
|
||||
|
||||
tpm_key = f"{id}:tpm:{current_minute}"
|
||||
# ------------
|
||||
# Update usage
|
||||
# ------------
|
||||
# update cache
|
||||
|
||||
## TPM
|
||||
await self.router_cache.async_increment_cache(
|
||||
key=tpm_key, value=total_tokens
|
||||
)
|
||||
|
||||
### TESTING ###
|
||||
if self.test_flag:
|
||||
self.logged_success += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
def _common_checks_available_deployment(
|
||||
self,
|
||||
model_group: str,
|
||||
healthy_deployments: list,
|
||||
tpm_keys: list,
|
||||
tpm_values: list,
|
||||
rpm_keys: list,
|
||||
rpm_values: list,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
):
|
||||
"""
|
||||
Common checks for get available deployment, across sync + async implementations
|
||||
"""
|
||||
tpm_dict = {} # {model_id: 1, ..}
|
||||
for idx, key in enumerate(tpm_keys):
|
||||
tpm_dict[tpm_keys[idx]] = tpm_values[idx]
|
||||
|
||||
rpm_dict = {} # {model_id: 1, ..}
|
||||
for idx, key in enumerate(rpm_keys):
|
||||
rpm_dict[rpm_keys[idx]] = rpm_values[idx]
|
||||
|
||||
try:
|
||||
input_tokens = token_counter(messages=messages, text=input)
|
||||
except:
|
||||
input_tokens = 0
|
||||
verbose_router_logger.debug(f"input_tokens={input_tokens}")
|
||||
# -----------------------
|
||||
# Find lowest used model
|
||||
# ----------------------
|
||||
lowest_tpm = float("inf")
|
||||
|
||||
if tpm_dict is None: # base case - none of the deployments have been used
|
||||
# initialize a tpm dict with {model_id: 0}
|
||||
tpm_dict = {}
|
||||
for deployment in healthy_deployments:
|
||||
tpm_dict[deployment["model_info"]["id"]] = 0
|
||||
else:
|
||||
for d in healthy_deployments:
|
||||
## if healthy deployment not yet used
|
||||
if d["model_info"]["id"] not in tpm_dict:
|
||||
tpm_dict[d["model_info"]["id"]] = 0
|
||||
|
||||
all_deployments = tpm_dict
|
||||
|
||||
deployment = None
|
||||
for item, item_tpm in all_deployments.items():
|
||||
## get the item from model list
|
||||
_deployment = None
|
||||
for m in healthy_deployments:
|
||||
if item == m["model_info"]["id"]:
|
||||
_deployment = m
|
||||
|
||||
if _deployment is None:
|
||||
continue # skip to next one
|
||||
|
||||
_deployment_tpm = None
|
||||
if _deployment_tpm is None:
|
||||
_deployment_tpm = _deployment.get("tpm")
|
||||
if _deployment_tpm is None:
|
||||
_deployment_tpm = _deployment.get("litellm_params", {}).get("tpm")
|
||||
if _deployment_tpm is None:
|
||||
_deployment_tpm = _deployment.get("model_info", {}).get("tpm")
|
||||
if _deployment_tpm is None:
|
||||
_deployment_tpm = float("inf")
|
||||
|
||||
_deployment_rpm = None
|
||||
if _deployment_rpm is None:
|
||||
_deployment_rpm = _deployment.get("rpm")
|
||||
if _deployment_rpm is None:
|
||||
_deployment_rpm = _deployment.get("litellm_params", {}).get("rpm")
|
||||
if _deployment_rpm is None:
|
||||
_deployment_rpm = _deployment.get("model_info", {}).get("rpm")
|
||||
if _deployment_rpm is None:
|
||||
_deployment_rpm = float("inf")
|
||||
|
||||
if item_tpm + input_tokens > _deployment_tpm:
|
||||
continue
|
||||
elif (rpm_dict is not None and item in rpm_dict) and (
|
||||
rpm_dict[item] + 1 > _deployment_rpm
|
||||
):
|
||||
continue
|
||||
elif item_tpm < lowest_tpm:
|
||||
lowest_tpm = item_tpm
|
||||
deployment = _deployment
|
||||
print_verbose("returning picked lowest tpm/rpm deployment.")
|
||||
return deployment
|
||||
|
||||
async def async_get_available_deployments(
|
||||
self,
|
||||
model_group: str,
|
||||
healthy_deployments: list,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
):
|
||||
"""
|
||||
Async implementation of get deployments.
|
||||
|
||||
Reduces time to retrieve the tpm/rpm values from cache
|
||||
"""
|
||||
# get list of potential deployments
|
||||
verbose_router_logger.debug(
|
||||
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
|
||||
)
|
||||
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
tpm_keys = []
|
||||
rpm_keys = []
|
||||
for m in healthy_deployments:
|
||||
if isinstance(m, dict):
|
||||
id = m.get("model_info", {}).get(
|
||||
"id"
|
||||
) # a deployment should always have an 'id'. this is set in router.py
|
||||
tpm_key = "{}:tpm:{}".format(id, current_minute)
|
||||
rpm_key = "{}:rpm:{}".format(id, current_minute)
|
||||
|
||||
tpm_keys.append(tpm_key)
|
||||
rpm_keys.append(rpm_key)
|
||||
|
||||
tpm_values = await self.router_cache.async_batch_get_cache(
|
||||
keys=tpm_keys
|
||||
) # [1, 2, None, ..]
|
||||
rpm_values = await self.router_cache.async_batch_get_cache(
|
||||
keys=rpm_keys
|
||||
) # [1, 2, None, ..]
|
||||
|
||||
return self._common_checks_available_deployment(
|
||||
model_group=model_group,
|
||||
healthy_deployments=healthy_deployments,
|
||||
tpm_keys=tpm_keys,
|
||||
tpm_values=tpm_values,
|
||||
rpm_keys=rpm_keys,
|
||||
rpm_values=rpm_values,
|
||||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
|
||||
def get_available_deployments(
|
||||
self,
|
||||
model_group: str,
|
||||
healthy_deployments: list,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
):
|
||||
"""
|
||||
Returns a deployment with the lowest TPM/RPM usage.
|
||||
"""
|
||||
# get list of potential deployments
|
||||
verbose_router_logger.debug(
|
||||
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
|
||||
)
|
||||
|
||||
dt = get_utc_datetime()
|
||||
current_minute = dt.strftime("%H-%M")
|
||||
tpm_keys = []
|
||||
rpm_keys = []
|
||||
for m in healthy_deployments:
|
||||
if isinstance(m, dict):
|
||||
id = m.get("model_info", {}).get(
|
||||
"id"
|
||||
) # a deployment should always have an 'id'. this is set in router.py
|
||||
tpm_key = "{}:tpm:{}".format(id, current_minute)
|
||||
rpm_key = "{}:rpm:{}".format(id, current_minute)
|
||||
|
||||
tpm_keys.append(tpm_key)
|
||||
rpm_keys.append(rpm_key)
|
||||
|
||||
tpm_values = self.router_cache.batch_get_cache(
|
||||
keys=tpm_keys
|
||||
) # [1, 2, None, ..]
|
||||
rpm_values = self.router_cache.batch_get_cache(
|
||||
keys=rpm_keys
|
||||
) # [1, 2, None, ..]
|
||||
|
||||
return self._common_checks_available_deployment(
|
||||
model_group=model_group,
|
||||
healthy_deployments=healthy_deployments,
|
||||
tpm_keys=tpm_keys,
|
||||
tpm_values=tpm_values,
|
||||
rpm_keys=rpm_keys,
|
||||
rpm_values=rpm_values,
|
||||
messages=messages,
|
||||
input=input,
|
||||
)
|
File diff suppressed because it is too large
67
litellm/tests/test_alerting.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
# What is this?
|
||||
## Tests slack alerting on proxy logging object
|
||||
|
||||
import sys
|
||||
import os
|
||||
import io, asyncio
|
||||
from datetime import datetime
|
||||
|
||||
# import logging
|
||||
# logging.basicConfig(level=logging.DEBUG)
|
||||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
from litellm.proxy.utils import ProxyLogging
|
||||
from litellm.caching import DualCache
|
||||
import litellm
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_api_base():
|
||||
_pl = ProxyLogging(user_api_key_cache=DualCache())
|
||||
_pl.update_values(alerting=["slack"], alerting_threshold=100, redis_cache=None)
|
||||
model = "chatgpt-v-2"
|
||||
messages = [{"role": "user", "content": "Hey how's it going?"}]
|
||||
litellm_params = {
|
||||
"acompletion": True,
|
||||
"api_key": None,
|
||||
"api_base": "https://openai-gpt-4-test-v-1.openai.azure.com/",
|
||||
"force_timeout": 600,
|
||||
"logger_fn": None,
|
||||
"verbose": False,
|
||||
"custom_llm_provider": "azure",
|
||||
"litellm_call_id": "68f46d2d-714d-4ad8-8137-69600ec8755c",
|
||||
"model_alias_map": {},
|
||||
"completion_call_id": None,
|
||||
"metadata": None,
|
||||
"model_info": None,
|
||||
"proxy_server_request": None,
|
||||
"preset_cache_key": None,
|
||||
"no-log": False,
|
||||
"stream_response": {},
|
||||
}
|
||||
start_time = datetime.now()
|
||||
end_time = datetime.now()
|
||||
|
||||
time_difference_float, model, api_base, messages = (
|
||||
_pl._response_taking_too_long_callback(
|
||||
kwargs={
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
"litellm_params": litellm_params,
|
||||
},
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
)
|
||||
|
||||
assert api_base is not None
|
||||
assert isinstance(api_base, str)
|
||||
assert len(api_base) > 0
|
||||
request_info = (
|
||||
f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
|
||||
)
|
||||
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {100}s`"
|
||||
await _pl.alerting_handler(
|
||||
message=slow_message + request_info,
|
||||
level="Low",
|
||||
)
|
|
@ -15,6 +15,7 @@ import time, random
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="AWS Suspended Account")
|
||||
def test_s3_logging():
|
||||
# all s3 requests need to be in one test function
|
||||
# since we are modifying stdout, and pytests runs tests in parallel
|
||||
|
@ -124,6 +125,7 @@ def test_s3_logging():
|
|||
# test_s3_logging()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="AWS Suspended Account")
|
||||
def test_s3_logging_async():
|
||||
# this tests time added to make s3 logging calls, vs just acompletion calls
|
||||
try:
|
||||
|
|
|
@ -12,6 +12,7 @@ import pytest, asyncio
|
|||
import litellm
|
||||
from litellm import embedding, completion, completion_cost, Timeout, acompletion
|
||||
from litellm import RateLimitError
|
||||
from litellm.tests.test_streaming import streaming_format_tests
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
@ -22,6 +23,40 @@ user_message = "Write a short poem about the sky"
|
|||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
def get_vertex_ai_creds_json() -> dict:
|
||||
# Define the path to the vertex_key.json file
|
||||
print("loading vertex ai credentials")
|
||||
filepath = os.path.dirname(os.path.abspath(__file__))
|
||||
vertex_key_path = filepath + "/vertex_key.json"
|
||||
|
||||
# Read the existing content of the file or create an empty dictionary
|
||||
try:
|
||||
with open(vertex_key_path, "r") as file:
|
||||
# Read the file content
|
||||
print("Read vertexai file path")
|
||||
content = file.read()
|
||||
|
||||
# If the file is empty or not valid JSON, create an empty dictionary
|
||||
if not content or not content.strip():
|
||||
service_account_key_data = {}
|
||||
else:
|
||||
# Attempt to load the existing JSON content
|
||||
file.seek(0)
|
||||
service_account_key_data = json.load(file)
|
||||
except FileNotFoundError:
|
||||
# If the file doesn't exist, create an empty dictionary
|
||||
service_account_key_data = {}
|
||||
|
||||
# Update the service_account_key_data with environment variables
|
||||
private_key_id = os.environ.get("VERTEX_AI_PRIVATE_KEY_ID", "")
|
||||
private_key = os.environ.get("VERTEX_AI_PRIVATE_KEY", "")
|
||||
private_key = private_key.replace("\\n", "\n")
|
||||
service_account_key_data["private_key_id"] = private_key_id
|
||||
service_account_key_data["private_key"] = private_key
|
||||
|
||||
return service_account_key_data
|
||||
|
||||
|
||||
def load_vertex_ai_credentials():
|
||||
# Define the path to the vertex_key.json file
|
||||
print("loading vertex ai credentials")
|
||||
|
@ -84,6 +119,120 @@ async def get_response():
|
|||
pytest.fail(f"An error occurred - {str(e)}")
|
||||
|
||||
|
||||
# @pytest.mark.skip(
|
||||
# reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
|
||||
# )
|
||||
def test_vertex_ai_anthropic():
|
||||
model = "claude-3-sonnet@20240229"
|
||||
|
||||
vertex_ai_project = "adroit-crow-413218"
|
||||
vertex_ai_location = "asia-southeast1"
|
||||
json_obj = get_vertex_ai_creds_json()
|
||||
vertex_credentials = json.dumps(json_obj)
|
||||
|
||||
response = completion(
|
||||
model="vertex_ai/" + model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.7,
|
||||
vertex_ai_project=vertex_ai_project,
|
||||
vertex_ai_location=vertex_ai_location,
|
||||
vertex_credentials=vertex_credentials,
|
||||
)
|
||||
print("\nModel Response", response)
|
||||
|
||||
|
||||
# @pytest.mark.skip(
|
||||
# reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
|
||||
# )
|
||||
def test_vertex_ai_anthropic_streaming():
|
||||
# load_vertex_ai_credentials()
|
||||
|
||||
# litellm.set_verbose = True
|
||||
|
||||
model = "claude-3-sonnet@20240229"
|
||||
|
||||
vertex_ai_project = "adroit-crow-413218"
|
||||
vertex_ai_location = "asia-southeast1"
|
||||
json_obj = get_vertex_ai_creds_json()
|
||||
vertex_credentials = json.dumps(json_obj)
|
||||
|
||||
response = completion(
|
||||
model="vertex_ai/" + model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.7,
|
||||
vertex_ai_project=vertex_ai_project,
|
||||
vertex_ai_location=vertex_ai_location,
|
||||
stream=True,
|
||||
)
|
||||
# print("\nModel Response", response)
|
||||
for chunk in response:
|
||||
print(f"chunk: {chunk}")
|
||||
|
||||
# raise Exception("it worked!")
|
||||
|
||||
|
||||
# test_vertex_ai_anthropic_streaming()
|
||||
|
||||
|
||||
# @pytest.mark.skip(
|
||||
# reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
|
||||
# )
|
||||
@pytest.mark.asyncio
|
||||
async def test_vertex_ai_anthropic_async():
|
||||
# load_vertex_ai_credentials()
|
||||
|
||||
model = "claude-3-sonnet@20240229"
|
||||
|
||||
vertex_ai_project = "adroit-crow-413218"
|
||||
vertex_ai_location = "asia-southeast1"
|
||||
json_obj = get_vertex_ai_creds_json()
|
||||
vertex_credentials = json.dumps(json_obj)
|
||||
|
||||
response = await acompletion(
|
||||
model="vertex_ai/" + model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.7,
|
||||
vertex_ai_project=vertex_ai_project,
|
||||
vertex_ai_location=vertex_ai_location,
|
||||
vertex_credentials=vertex_credentials,
|
||||
)
|
||||
print(f"Model Response: {response}")
|
||||
|
||||
|
||||
# asyncio.run(test_vertex_ai_anthropic_async())
|
||||
|
||||
|
||||
# @pytest.mark.skip(
|
||||
# reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
|
||||
# )
|
||||
@pytest.mark.asyncio
|
||||
async def test_vertex_ai_anthropic_async_streaming():
|
||||
# load_vertex_ai_credentials()
|
||||
litellm.set_verbose = True
|
||||
model = "claude-3-sonnet@20240229"
|
||||
|
||||
vertex_ai_project = "adroit-crow-413218"
|
||||
vertex_ai_location = "asia-southeast1"
|
||||
json_obj = get_vertex_ai_creds_json()
|
||||
vertex_credentials = json.dumps(json_obj)
|
||||
|
||||
response = await acompletion(
|
||||
model="vertex_ai/" + model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.7,
|
||||
vertex_ai_project=vertex_ai_project,
|
||||
vertex_ai_location=vertex_ai_location,
|
||||
vertex_credentials=vertex_credentials,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
async for chunk in response:
|
||||
print(f"chunk: {chunk}")
|
||||
|
||||
|
||||
# asyncio.run(test_vertex_ai_anthropic_async_streaming())
|
||||
|
||||
|
||||
def test_vertex_ai():
|
||||
import random
|
||||
|
||||
|
@ -95,8 +244,8 @@ def test_vertex_ai():
|
|||
+ litellm.vertex_code_text_models
|
||||
)
|
||||
litellm.set_verbose = False
|
||||
vertex_ai_project = "reliablekeys"
|
||||
# litellm.vertex_project = "reliablekeys"
|
||||
vertex_ai_project = "adroit-crow-413218"
|
||||
# litellm.vertex_project = "adroit-crow-413218"
|
||||
|
||||
test_models = random.sample(test_models, 1)
|
||||
test_models += litellm.vertex_language_models # always test gemini-pro
|
||||
|
@ -111,7 +260,6 @@ def test_vertex_ai():
|
|||
"text-bison@001",
|
||||
"gemini-1.5-pro",
|
||||
"gemini-1.5-pro-preview-0215",
|
||||
"gemini-1.5-pro-vision",
|
||||
]:
|
||||
# our account does not have access to this model
|
||||
continue
|
||||
|
@ -142,7 +290,7 @@ def test_vertex_ai():
|
|||
def test_vertex_ai_stream():
|
||||
load_vertex_ai_credentials()
|
||||
litellm.set_verbose = True
|
||||
litellm.vertex_project = "reliablekeys"
|
||||
litellm.vertex_project = "adroit-crow-413218"
|
||||
import random
|
||||
|
||||
test_models = (
|
||||
|
@ -164,7 +312,6 @@ def test_vertex_ai_stream():
|
|||
"text-bison@001",
|
||||
"gemini-1.5-pro",
|
||||
"gemini-1.5-pro-preview-0215",
|
||||
"gemini-1.5-pro-vision",
|
||||
]:
|
||||
# our account does not have access to this model
|
||||
continue
|
||||
|
@ -218,7 +365,6 @@ async def test_async_vertexai_response():
|
|||
"text-bison@001",
|
||||
"gemini-1.5-pro",
|
||||
"gemini-1.5-pro-preview-0215",
|
||||
"gemini-1.5-pro-vision",
|
||||
]:
|
||||
# our account does not have access to this model
|
||||
continue
|
||||
|
@ -263,7 +409,6 @@ async def test_async_vertexai_streaming_response():
|
|||
"text-bison@001",
|
||||
"gemini-1.5-pro",
|
||||
"gemini-1.5-pro-preview-0215",
|
||||
"gemini-1.5-pro-vision",
|
||||
]:
|
||||
# our account does not have access to this model
|
||||
continue
|
||||
|
@ -326,7 +471,8 @@ def test_gemini_pro_vision():
|
|||
# DO Not DELETE this ASSERT
|
||||
# Google counts the prompt tokens for us, we should ensure we use the tokens from the orignal response
|
||||
assert prompt_tokens == 263 # the gemini api returns 263 to us
|
||||
|
||||
except litellm.RateLimitError as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
if "500 Internal error encountered.'" in str(e):
|
||||
pass
|
||||
|
@ -406,6 +552,7 @@ def test_gemini_pro_function_calling():
|
|||
},
|
||||
}
|
||||
]
|
||||
|
||||
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
|
||||
completion = litellm.completion(
|
||||
model="gemini-pro", messages=messages, tools=tools, tool_choice="auto"
|
||||
|
@ -413,6 +560,47 @@ def test_gemini_pro_function_calling():
|
|||
print(f"completion: {completion}")
|
||||
assert completion.choices[0].message.content is None
|
||||
assert len(completion.choices[0].message.tool_calls) == 1
|
||||
try:
|
||||
load_vertex_ai_credentials()
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
messages = [
|
||||
{"role": "user", "content": "What's the weather like in Boston today?"}
|
||||
]
|
||||
completion = litellm.completion(
|
||||
model="gemini-pro", messages=messages, tools=tools, tool_choice="auto"
|
||||
)
|
||||
print(f"completion: {completion}")
|
||||
assert completion.choices[0].message.content is None
|
||||
assert len(completion.choices[0].message.tool_calls) == 1
|
||||
except litellm.RateLimitError as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
if "429 Quota exceeded" in str(e):
|
||||
pass
|
||||
else:
|
||||
return
|
||||
|
||||
|
||||
# gemini_pro_function_calling()
|
||||
|
@ -442,6 +630,7 @@ def test_gemini_pro_function_calling_streaming():
|
|||
}
|
||||
]
|
||||
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
|
||||
try:
|
||||
completion = litellm.completion(
|
||||
model="gemini-pro",
|
||||
messages=messages,
|
||||
|
@ -454,6 +643,8 @@ def test_gemini_pro_function_calling_streaming():
|
|||
# assert len(completion.choices[0].message.tool_calls) == 1
|
||||
for chunk in completion:
|
||||
print(f"chunk: {chunk}")
|
||||
except litellm.RateLimitError as e:
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
|
@ -345,6 +345,83 @@ async def test_embedding_caching_azure_individual_items():
|
|||
assert embedding_val_2._hidden_params["cache_hit"] == True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_embedding_caching_azure_individual_items_reordered():
|
||||
"""
|
||||
Tests caching for individual items in an embedding list
|
||||
|
||||
- Cache an item
|
||||
- call aembedding(..) with the item + 1 unique item
|
||||
- compare to a 2nd aembedding (...) with 2 unique items
|
||||
```
|
||||
embedding_1 = ["hey how's it going", "I'm doing well"]
|
||||
embedding_val_1 = embedding(...)
|
||||
|
||||
embedding_2 = ["hey how's it going", "I'm fine"]
|
||||
embedding_val_2 = embedding(...)
|
||||
|
||||
assert embedding_val_1[0]["id"] == embedding_val_2[0]["id"]
|
||||
```
|
||||
"""
|
||||
litellm.cache = Cache()
|
||||
common_msg = f"{uuid.uuid4()}"
|
||||
common_msg_2 = f"hey how's it going {uuid.uuid4()}"
|
||||
embedding_1 = [common_msg_2, common_msg]
|
||||
embedding_2 = [
|
||||
common_msg,
|
||||
f"I'm fine {uuid.uuid4()}",
|
||||
]
|
||||
|
||||
embedding_val_1 = await aembedding(
|
||||
model="azure/azure-embedding-model", input=embedding_1, caching=True
|
||||
)
|
||||
embedding_val_2 = await aembedding(
|
||||
model="azure/azure-embedding-model", input=embedding_2, caching=True
|
||||
)
|
||||
print(f"embedding_val_2._hidden_params: {embedding_val_2._hidden_params}")
|
||||
assert embedding_val_2._hidden_params["cache_hit"] == True
|
||||
|
||||
assert embedding_val_2.data[0]["embedding"] == embedding_val_1.data[1]["embedding"]
|
||||
assert embedding_val_2.data[0]["index"] != embedding_val_1.data[1]["index"]
|
||||
assert embedding_val_2.data[0]["index"] == 0
|
||||
assert embedding_val_1.data[1]["index"] == 1
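The assertions above pin down the per-item behaviour: each input string is cached independently, so a repeated string is served from cache regardless of its position, while "index" always reflects its position in the current request. A hedged sketch of the same behaviour outside pytest (requires Azure embedding credentials; the model name mirrors the test above):

import asyncio, uuid
import litellm
from litellm import aembedding
from litellm.caching import Cache

async def demo():
    litellm.cache = Cache()  # in-memory cache, as in the test above
    shared = f"shared sentence {uuid.uuid4()}"
    first = await aembedding(
        model="azure/azure-embedding-model",
        input=[f"unique A {uuid.uuid4()}", shared],
        caching=True,
    )
    second = await aembedding(
        model="azure/azure-embedding-model",
        input=[shared, f"unique B {uuid.uuid4()}"],
        caching=True,
    )
    # same vector comes back for the shared string, but at a different index
    assert second.data[0]["embedding"] == first.data[1]["embedding"]
    assert second.data[0]["index"] == 0 and first.data[1]["index"] == 1

# asyncio.run(demo())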
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_embedding_caching_base_64():
|
||||
""" """
|
||||
litellm.cache = Cache(
|
||||
type="redis",
|
||||
host=os.environ["REDIS_HOST"],
|
||||
port=os.environ["REDIS_PORT"],
|
||||
)
|
||||
import uuid
|
||||
|
||||
inputs = [
|
||||
f"{uuid.uuid4()} hello this is ishaan",
|
||||
f"{uuid.uuid4()} hello this is ishaan again",
|
||||
]
|
||||
|
||||
embedding_val_1 = await aembedding(
|
||||
model="azure/azure-embedding-model",
|
||||
input=inputs,
|
||||
caching=True,
|
||||
encoding_format="base64",
|
||||
)
|
||||
embedding_val_2 = await aembedding(
|
||||
model="azure/azure-embedding-model",
|
||||
input=inputs,
|
||||
caching=True,
|
||||
encoding_format="base64",
|
||||
)
|
||||
|
||||
assert embedding_val_2._hidden_params["cache_hit"] == True
|
||||
print(embedding_val_2)
|
||||
print(embedding_val_1)
|
||||
assert embedding_val_2.data[0]["embedding"] == embedding_val_1.data[0]["embedding"]
|
||||
assert embedding_val_2.data[1]["embedding"] == embedding_val_1.data[1]["embedding"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redis_cache_basic():
|
||||
"""
|
||||
|
@ -630,6 +707,39 @@ async def test_redis_cache_acompletion_stream():
|
|||
# test_redis_cache_acompletion_stream()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redis_cache_atext_completion():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
prompt = f"write a one sentence poem about: {uuid.uuid4()}"
|
||||
litellm.cache = Cache(
|
||||
type="redis",
|
||||
host=os.environ["REDIS_HOST"],
|
||||
port=os.environ["REDIS_PORT"],
|
||||
password=os.environ["REDIS_PASSWORD"],
|
||||
supported_call_types=["atext_completion"],
|
||||
)
|
||||
print("test for caching, atext_completion")
|
||||
|
||||
response1 = await litellm.atext_completion(
|
||||
model="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=40, temperature=1
|
||||
)
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
print("\n\n Response 1 content: ", response1, "\n\n")
|
||||
|
||||
response2 = await litellm.atext_completion(
|
||||
model="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=40, temperature=1
|
||||
)
|
||||
|
||||
print(response2)
|
||||
|
||||
assert response1.id == response2.id
|
||||
except Exception as e:
|
||||
print(f"{str(e)}\n\n{traceback.format_exc()}")
|
||||
raise e
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redis_cache_acompletion_stream_bedrock():
|
||||
import asyncio
|
||||
|
|
|
@ -7,13 +7,13 @@ import os, io
|
|||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
) # Adds the parent directory to the, system path
|
||||
import pytest
|
||||
import litellm
|
||||
from litellm import embedding, completion, completion_cost, Timeout
|
||||
from litellm import RateLimitError
|
||||
|
||||
# litellm.num_retries = 3
|
||||
# litellm.num_retries=3
|
||||
litellm.cache = None
|
||||
litellm.success_callback = []
|
||||
user_message = "Write a short poem about the sky"
|
||||
|
@ -50,7 +50,22 @@ def test_completion_custom_provider_model_name():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# test_completion_custom_provider_model_name()
|
||||
def test_completion_azure_command_r():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
|
||||
response = completion(
|
||||
model="azure/command-r-plus",
|
||||
api_base=os.getenv("AZURE_COHERE_API_BASE"),
|
||||
api_key=os.getenv("AZURE_COHERE_API_KEY"),
|
||||
messages=[{"role": "user", "content": "What is the meaning of life?"}],
|
||||
)
|
||||
|
||||
print(response)
|
||||
except litellm.Timeout as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_claude():
|
||||
|
@ -477,6 +492,31 @@ def test_completion_claude2_1():
|
|||
|
||||
# test_completion_claude2_1()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_acompletion_claude2_1():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
print("claude2.1 test request")
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Your goal is generate a joke on the topic user gives.",
|
||||
},
|
||||
{"role": "user", "content": "Generate a 3 liner joke for me"},
|
||||
]
|
||||
# test without max tokens
|
||||
response = await litellm.acompletion(model="claude-2.1", messages=messages)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
print(response.usage)
|
||||
print(response.usage.completion_tokens)
|
||||
print(response["usage"]["completion_tokens"])
|
||||
# print("new cost tracking")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# def test_completion_oobabooga():
|
||||
# try:
|
||||
# response = completion(
|
||||
|
@ -581,7 +621,7 @@ def test_completion_gpt4_vision():
|
|||
|
||||
|
||||
def test_completion_azure_gpt4_vision():
|
||||
# azure/gpt-4, vision takes 5 seconds to respond
|
||||
# azure/gpt-4, vision takes 5-seconds to respond
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
response = completion(
|
||||
|
@ -960,6 +1000,19 @@ def test_completion_text_openai():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_text_openai_async():
|
||||
try:
|
||||
# litellm.set_verbose =True
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo-instruct", messages=messages
|
||||
)
|
||||
print(response["choices"][0]["message"]["content"])
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def custom_callback(
|
||||
kwargs, # kwargs to completion
|
||||
completion_response, # response from completion
|
||||
|
@ -1604,9 +1657,9 @@ def test_completion_replicate_vicuna():
|
|||
|
||||
def test_replicate_custom_prompt_dict():
|
||||
litellm.set_verbose = True
|
||||
model_name = "replicate/meta/llama-2-7b-chat"
|
||||
model_name = "replicate/meta/llama-2-70b-chat"
|
||||
litellm.register_prompt_template(
|
||||
model="replicate/meta/llama-2-7b-chat",
|
||||
model="replicate/meta/llama-2-70b-chat",
|
||||
initial_prompt_value="You are a good assistant", # [OPTIONAL]
|
||||
roles={
|
||||
"system": {
|
||||
|
@ -1624,6 +1677,7 @@ def test_replicate_custom_prompt_dict():
|
|||
},
|
||||
final_prompt_value="Now answer as best you can:", # [OPTIONAL]
|
||||
)
|
||||
try:
|
||||
response = completion(
|
||||
model=model_name,
|
||||
messages=[
|
||||
|
@ -1632,8 +1686,15 @@ def test_replicate_custom_prompt_dict():
|
|||
"content": "what is yc write 1 paragraph",
|
||||
}
|
||||
],
|
||||
repetition_penalty=0.1,
|
||||
num_retries=3,
|
||||
)
|
||||
except litellm.APIError as e:
|
||||
pass
|
||||
except litellm.APIConnectionError as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
print(f"response: {response}")
|
||||
litellm.custom_prompt_dict = {} # reset
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff.