Merge branch 'main' into patch-1

2024-05-17 10:26:14 +02:00 · 2024-05-17 10:26:14 +02:00 · 80ef0f86d1
commit 80ef0f86d1
parent e2213a1a5e 186f11612a
332 changed files with 43212 additions and 8160 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,4 +1,4 @@
-version: 2.1
+version: 4.3.4
 jobs:
  local_testing:
    docker:
@ -40,7 +40,7 @@ jobs:
            pip install "aioboto3==12.3.0"
            pip install langchain
            pip install lunary==0.2.5
-            pip install "langfuse==2.7.3"
+            pip install "langfuse==2.27.1"
            pip install numpydoc
            pip install traceloop-sdk==0.0.69
            pip install openai
@ -57,6 +57,9 @@ jobs:
            pip install "pytest-mock==3.12.0"
            pip install python-multipart
            pip install google-cloud-aiplatform
+            pip install prometheus-client==0.20.0
+            pip install "pydantic==2.7.1"
+            pip install "diskcache==5.6.1"
      - save_cache:
          paths:
            - ./venv
@ -187,22 +190,28 @@ jobs:
          command: |
            docker run -d \
              -p 4000:4000 \
-              -e DATABASE_URL=$PROXY_DOCKER_DB_URL \
+              -e DATABASE_URL=$PROXY_DATABASE_URL \
              -e AZURE_API_KEY=$AZURE_API_KEY \
+              -e REDIS_HOST=$REDIS_HOST \
+              -e REDIS_PASSWORD=$REDIS_PASSWORD \
+              -e REDIS_PORT=$REDIS_PORT \
              -e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
              -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
              -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
              -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
              -e AWS_REGION_NAME=$AWS_REGION_NAME \
+              -e AUTO_INFER_REGION=True \
              -e OPENAI_API_KEY=$OPENAI_API_KEY \
+              -e LANGFUSE_PROJECT1_PUBLIC=$LANGFUSE_PROJECT1_PUBLIC \
+              -e LANGFUSE_PROJECT2_PUBLIC=$LANGFUSE_PROJECT2_PUBLIC \
+              -e LANGFUSE_PROJECT1_SECRET=$LANGFUSE_PROJECT1_SECRET \
+              -e LANGFUSE_PROJECT2_SECRET=$LANGFUSE_PROJECT2_SECRET \
              --name my-app \
              -v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
              my-app:latest \
              --config /app/config.yaml \
              --port 4000 \
-              --num_workers 8 \
              --detailed_debug \
-              --run_gunicorn \
      - run:
          name: Install curl and dockerize
          command: |
@ -217,7 +226,7 @@ jobs:
          background: true
      - run: 
          name: Wait for app to be ready
-          command: dockerize -wait http://localhost:4000 -timeout 1m
+          command: dockerize -wait http://localhost:4000 -timeout 5m
      - run:
          name: Run tests
          command: |
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -0,0 +1,51 @@
+{
+	"name": "Python 3.11",
+	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+	"image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
+	// https://github.com/devcontainers/images/tree/main/src/python
+	// https://mcr.microsoft.com/en-us/product/devcontainers/python/tags
+
+	// "build": {
+	// 	"dockerfile": "Dockerfile",
+	// 	"context": ".."
+	// },
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+
+	// Configure tool-specific properties.
+	"customizations": {
+		// Configure properties specific to VS Code.
+		"vscode": {
+			"settings": {},
+			"extensions": [
+				"ms-python.python",
+				"ms-python.vscode-pylance",
+				"GitHub.copilot",
+				"GitHub.copilot-chat"
+			]
+		}
+	},
+	
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	"forwardPorts": [4000],
+
+	"containerEnv": {
+		"LITELLM_LOG": "DEBUG"
+	},
+
+	// Use 'portsAttributes' to set default properties for specific forwarded ports. 
+	// More info: https://containers.dev/implementors/json_reference/#port-attributes
+	"portsAttributes": {
+		"4000": {
+			"label": "LiteLLM Server",
+			"onAutoForward": "notify"
+		}
+	},
+
+	// More info: https://aka.ms/dev-containers-non-root.
+	// "remoteUser": "litellm",
+
+	// Use 'postCreateCommand' to run commands after the container is created.
+	"postCreateCommand": "pipx install poetry && poetry install -E extra_proxy -E proxy"
+}
--- a/.dockerignore
+++ b/.dockerignore
@ -1,5 +1,5 @@
-/docs
-/cookbook
-/.circleci
-/.github
-/tests
+docs
+cookbook
+.circleci
+.github
+tests
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -0,0 +1,10 @@
+# Add the commit hash of any commit you want to ignore in `git blame` here.
+# One commit hash per line.
+#
+# The GitHub Blame UI will use this file automatically!
+#
+# Run this command to always ignore formatting commits in `git blame`
+#     git config blame.ignoreRevsFile .git-blame-ignore-revs
+
+# Update pydantic code to fix warnings (GH-3600)
+876840e9957bc7e9f7d6a2b58c4d7c53dad16481
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,29 @@
+## Title
+
+<!-- e.g. "Implement user authentication feature" -->
+
+## Relevant issues
+
+<!-- e.g. "Fixes #000" -->
+
+## Type
+
+<!-- Select the type of Pull Request -->
+<!-- Keep only the necessary ones -->
+
+🆕 New Feature
+🐛 Bug Fix
+🧹 Refactoring
+📖 Documentation
+🚄 Infrastructure
+✅ Test
+
+## Changes
+
+<!-- List of changes -->
+
+## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
+If there are UI changes, attach a screenshot/GIF of the working UI fixes
+
+<!-- Test procedure -->
+
--- a/.github/workflows/interpret_load_test.py
+++ b/.github/workflows/interpret_load_test.py
@ -64,6 +64,11 @@ if __name__ == "__main__":
    )  # Replace with your repository's username and name
    latest_release = repo.get_latest_release()
    print("got latest release: ", latest_release)
+    print(latest_release.title)
+    print(latest_release.tag_name)
+
+    release_version = latest_release.title
+
    print("latest release body: ", latest_release.body)
    print("markdown table: ", markdown_table)

@ -74,8 +79,25 @@ if __name__ == "__main__":
        start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
        existing_release_body = latest_release.body[:start_index]

+    docker_run_command = f"""
+\n\n
+## Docker Run LiteLLM Proxy
+
+```
+docker run \\
+-e STORE_MODEL_IN_DB=True \\
+-p 4000:4000 \\
+ghcr.io/berriai/litellm:main-{release_version}
+```
+    """
+    print("docker run command: ", docker_run_command)
+
    new_release_body = (
        existing_release_body
+        + docker_run_command
+        + "\n\n"
+        + "### Don't want to maintain your internal proxy? get in touch 🎉"
+        + "\nHosted Proxy Alpha: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
        + "\n\n"
        + "## Load Test LiteLLM Proxy Results"
        + "\n\n"
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 .venv
 .env
+litellm/proxy/myenv/*
 litellm_uuid.txt
 __pycache__/
 *.pyc
@ -50,3 +51,8 @@ kub.yaml
 loadtest_kub.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_new_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
+litellm/proxy/myenv/bin/activate
+litellm/proxy/myenv/bin/Activate.ps1
+myenv/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -7,7 +7,7 @@ repos:
    rev: 7.0.0  # The version of flake8 to use
    hooks:
    -  id: flake8
-       exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
+       exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
       additional_dependencies: [flake8-print]
       files: litellm/.*\.py
 -   repo: local
--- a/README.md
+++ b/README.md
@ -5,7 +5,7 @@
        <p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
        <br>
    </p>
-<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
+<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank">Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise" target="_blank">Enterprise Tier</a></h4>
 <h4 align="center">
    <a href="https://pypi.org/project/litellm/" target="_blank">
        <img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@ -128,7 +128,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content

 # OpenAI Proxy - ([Docs](https://docs.litellm.ai/docs/simple_proxy))

-Set Budgets & Rate limits across multiple projects
+Track spend + Load Balance across multiple projects
+
+[Hosted Proxy (Preview)](https://docs.litellm.ai/docs/hosted)

 The proxy provides:

@ -224,7 +226,9 @@ curl 'http://0.0.0.0:4000/key/generate' \
 | [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra)                       | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
 | [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity)                  | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
 | [Groq AI](https://docs.litellm.ai/docs/providers/groq)                              | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
+| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek)                         | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
 | [anyscale](https://docs.litellm.ai/docs/providers/anyscale)                         | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
+| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx)                  | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                | ✅                                                                            |
 | [voyage ai](https://docs.litellm.ai/docs/providers/voyage)                          |                                                         |                                                                                 |                                                                                     |                                                                                   | ✅                                                                            |
 | [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) |                                                         |                                                                                 |                                                                                     |                                                                                   | ✅                                                                            |

@ -245,7 +249,7 @@ Step 2: Navigate into the project, and install dependencies:

 ```
 cd litellm
-poetry install
+poetry install -E extra_proxy -E proxy
 ```

 Step 3: Test your change:
--- a/cookbook/Proxy_Batch_Users.ipynb
+++ b/cookbook/Proxy_Batch_Users.ipynb
@ -0,0 +1,204 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "680oRk1af-xJ"
+      },
+      "source": [
+        "# Environment Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "X7TgJFn8f88p"
+      },
+      "outputs": [],
+      "source": [
+        "import csv\n",
+        "from typing import Optional\n",
+        "import httpx, json\n",
+        "import asyncio\n",
+        "\n",
+        "proxy_base_url = \"http://0.0.0.0:4000\" # 👈 SET TO PROXY URL\n",
+        "master_key = \"sk-1234\" # 👈 SET TO PROXY MASTER KEY"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rauw8EOhgBz5"
+      },
+      "outputs": [],
+      "source": [
+        "## GLOBAL HTTP CLIENT ## - faster http calls\n",
+        "class HTTPHandler:\n",
+        "    def __init__(self, concurrent_limit=1000):\n",
+        "        # Create a client with a connection pool\n",
+        "        self.client = httpx.AsyncClient(\n",
+        "            limits=httpx.Limits(\n",
+        "                max_connections=concurrent_limit,\n",
+        "                max_keepalive_connections=concurrent_limit,\n",
+        "            )\n",
+        "        )\n",
+        "\n",
+        "    async def close(self):\n",
+        "        # Close the client when you're done with it\n",
+        "        await self.client.aclose()\n",
+        "\n",
+        "    async def get(\n",
+        "        self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None\n",
+        "    ):\n",
+        "        response = await self.client.get(url, params=params, headers=headers)\n",
+        "        return response\n",
+        "\n",
+        "    async def post(\n",
+        "        self,\n",
+        "        url: str,\n",
+        "        data: Optional[dict] = None,\n",
+        "        params: Optional[dict] = None,\n",
+        "        headers: Optional[dict] = None,\n",
+        "    ):\n",
+        "        try:\n",
+        "            response = await self.client.post(\n",
+        "                url, data=data, params=params, headers=headers\n",
+        "            )\n",
+        "            return response\n",
+        "        except Exception as e:\n",
+        "            raise e\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7LXN8zaLgOie"
+      },
+      "source": [
+        "# Import Sheet\n",
+        "\n",
+        "\n",
+        "Format: | ID | Name | Max Budget |"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oiED0usegPGf"
+      },
+      "outputs": [],
+      "source": [
+        "async def import_sheet():\n",
+        "    tasks = []\n",
+        "    http_client = HTTPHandler()\n",
+        "    with open('my-batch-sheet.csv', 'r') as file:\n",
+        "        csv_reader = csv.DictReader(file)\n",
+        "        for row in csv_reader:\n",
+        "            task = create_user(client=http_client, user_id=row['ID'], max_budget=row['Max Budget'], user_name=row['Name'])\n",
+        "            tasks.append(task)\n",
+        "            # print(f\"ID: {row['ID']}, Name: {row['Name']}, Max Budget: {row['Max Budget']}\")\n",
+        "\n",
+        "    keys = await asyncio.gather(*tasks)\n",
+        "\n",
+        "    with open('my-batch-sheet_new.csv', 'w', newline='') as new_file:\n",
+        "        fieldnames = ['ID', 'Name', 'Max Budget', 'keys']\n",
+        "        csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)\n",
+        "        csv_writer.writeheader()\n",
+        "\n",
+        "        with open('my-batch-sheet.csv', 'r') as file:\n",
+        "            csv_reader = csv.DictReader(file)\n",
+        "            for i, row in enumerate(csv_reader):\n",
+        "                row['keys'] = keys[i]  # Add the 'keys' value from the corresponding task result\n",
+        "                csv_writer.writerow(row)\n",
+        "\n",
+        "    await http_client.close()\n",
+        "\n",
+        "asyncio.run(import_sheet())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "E7M0Li_UgJeZ"
+      },
+      "source": [
+        "# Create Users + Keys\n",
+        "\n",
+        "- Creates a user\n",
+        "- Creates a key with max budget"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "NZudRFujf7j-"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "async def create_key_with_alias(client: HTTPHandler, user_id: str, max_budget: float):\n",
+        "    global proxy_base_url\n",
+        "    if not proxy_base_url.endswith(\"/\"):\n",
+        "        proxy_base_url += \"/\"\n",
+        "    url = proxy_base_url + \"key/generate\"\n",
+        "\n",
+        "    # call /key/generate\n",
+        "    print(\"CALLING /KEY/GENERATE\")\n",
+        "    response = await client.post(\n",
+        "        url=url,\n",
+        "        headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
+        "        data=json.dumps({\n",
+        "            \"user_id\": user_id,\n",
+        "            \"key_alias\": f\"{user_id}-key\",\n",
+        "            \"max_budget\": max_budget # 👈 KEY CHANGE: SETS MAX BUDGET PER KEY\n",
+        "        })\n",
+        "    )\n",
+        "    print(f\"response: {response.text}\")\n",
+        "    return response.json()[\"key\"]\n",
+        "\n",
+        "async def create_user(client: HTTPHandler, user_id: str, max_budget: float, user_name: str):\n",
+        "    \"\"\"\n",
+        "    - call /user/new\n",
+        "    - create key for user\n",
+        "    \"\"\"\n",
+        "    global proxy_base_url\n",
+        "    if not proxy_base_url.endswith(\"/\"):\n",
+        "        proxy_base_url += \"/\"\n",
+        "    url = proxy_base_url + \"user/new\"\n",
+        "\n",
+        "    # call /user/new\n",
+        "    await client.post(\n",
+        "        url=url,\n",
+        "        headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
+        "        data=json.dumps({\n",
+        "            \"user_id\": user_id,\n",
+        "            \"user_alias\": user_name,\n",
+        "            \"auto_create_key\": False,\n",
+        "            # \"max_budget\": max_budget # 👈 [OPTIONAL] Sets max budget per user (if you want to set a max budget across keys)\n",
+        "        })\n",
+        "    )\n",
+        "\n",
+        "    # create key for user\n",
+        "    return await create_key_with_alias(client=client, user_id=user_id, max_budget=max_budget)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/cookbook/liteLLM_IBM_Watsonx.ipynb
+++ b/cookbook/liteLLM_IBM_Watsonx.ipynb
--- a/cookbook/liteLLM_clarifai_Demo.ipynb
+++ b/cookbook/liteLLM_clarifai_Demo.ipynb
@ -0,0 +1,187 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LiteLLM Clarifai \n",
+    "This notebook walks you through how to use the LiteLLM integration with Clarifai to call LLM models hosted on Clarifai and receive responses in the OpenAI output format."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pre-Requisites"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#install necessary packages\n",
+    "!pip install litellm\n",
+    "!pip install clarifai"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To obtain a Clarifai Personal Access Token (PAT), follow the steps in the [Clarifai documentation](https://docs.clarifai.com/clarifai-basics/authentication/personal-access-tokens/)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Set Clarifai Credentials\n",
+    "import os\n",
+    "os.environ[\"CLARIFAI_API_KEY\"]= \"YOUR_CLARIFAI_PAT\" # Clarifai PAT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Mistral-large"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import litellm\n",
+    "\n",
+    "litellm.set_verbose=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mistral large response : ModelResponse(id='chatcmpl-6eed494d-7ae2-4870-b9c2-6a64d50a6151', choices=[Choices(finish_reason='stop', index=1, message=Message(content=\"In the grand tapestry of time, where tales unfold,\\nLies the chronicle of ages, a sight to behold.\\nA tale of empires rising, and kings of old,\\nOf civilizations lost, and stories untold.\\n\\nOnce upon a yesterday, in a time so vast,\\nHumans took their first steps, casting shadows in the past.\\nFrom the cradle of mankind, a journey they embarked,\\nThrough stone and bronze and iron, their skills they sharpened and marked.\\n\\nEgyptians built pyramids, reaching for the skies,\\nWhile Greeks sought wisdom, truth, in philosophies that lie.\\nRoman legions marched, their empire to expand,\\nAnd in the East, the Silk Road joined the world, hand in hand.\\n\\nThe Middle Ages came, with knights in shining armor,\\nFeudal lords and serfs, a time of both clamor and calm order.\\nThen Renaissance bloomed, like a flower in the sun,\\nA rebirth of art and science, a new age had begun.\\n\\nAcross the vast oceans, explorers sailed with courage bold,\\nDiscovering new lands, stories of adventure, untold.\\nIndustrial Revolution churned, progress in its wake,\\nMachines and factories, a whole new world to make.\\n\\nTwo World Wars raged, a testament to man's strife,\\nYet from the ashes rose hope, a renewed will for life.\\nInto the modern era, technology took flight,\\nConnecting every corner, bathed in digital light.\\n\\nHistory, a symphony, a melody of time,\\nA testament to human will, resilience so sublime.\\nIn every page, a lesson, in every tale, a guide,\\nFor understanding our past, shapes our future's tide.\", role='assistant'))], created=1713896412, model='https://api.clarifai.com/v2/users/mistralai/apps/completion/models/mistral-large/outputs', object='chat.completion', system_fingerprint=None, usage=Usage(prompt_tokens=13, completion_tokens=338, total_tokens=351))\n"
+     ]
+    }
+   ],
+   "source": [
+    "from litellm import completion\n",
+    "\n",
+    "messages = [{\"role\": \"user\",\"content\": \"\"\"Write a poem about history?\"\"\"}]\n",
+    "response=completion(\n",
+    "            model=\"clarifai/mistralai.completion.mistral-large\",\n",
+    "            messages=messages,\n",
+    "        )\n",
+    "\n",
+    "print(f\"Mistral large response : {response}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Claude-2.1 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Claude-2.1 response : ModelResponse(id='chatcmpl-d126c919-4db4-4aa3-ac8f-7edea41e0b93', choices=[Choices(finish_reason='stop', index=1, message=Message(content=\" Here's a poem I wrote about history:\\n\\nThe Tides of Time\\n\\nThe tides of time ebb and flow,\\nCarrying stories of long ago.\\nFigures and events come into light,\\nShaping the future with all their might.\\n\\nKingdoms rise, empires fall, \\nLeaving traces that echo down every hall.\\nRevolutions bring change with a fiery glow,\\nToppling structures from long ago.\\n\\nExplorers traverse each ocean and land,\\nSeeking treasures they don't understand.\\nWhile artists and writers try to make their mark,\\nHoping their works shine bright in the dark.\\n\\nThe cycle repeats again and again,\\nAs humanity struggles to learn from its pain.\\nThough the players may change on history's stage,\\nThe themes stay the same from age to age.\\n\\nWar and peace, life and death,\\nLove and strife with every breath.\\nThe tides of time continue their dance,\\nAs we join in, by luck or by chance.\\n\\nSo we study the past to light the way forward, \\nHeeding warnings from stories told and heard.\\nThe future unfolds from this unending flow -\\nWhere the tides of time ultimately go.\", role='assistant'))], created=1713896579, model='https://api.clarifai.com/v2/users/anthropic/apps/completion/models/claude-2_1/outputs', object='chat.completion', system_fingerprint=None, usage=Usage(prompt_tokens=12, completion_tokens=232, total_tokens=244))\n"
+     ]
+    }
+   ],
+   "source": [
+    "from litellm import completion\n",
+    "\n",
+    "messages = [{\"role\": \"user\",\"content\": \"\"\"Write a poem about history?\"\"\"}]\n",
+    "response=completion(\n",
+    "            model=\"clarifai/anthropic.completion.claude-2_1\",\n",
+    "            messages=messages,\n",
+    "        )\n",
+    "\n",
+    "print(f\"Claude-2.1 response : {response}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### OpenAI GPT-4 (Streaming)\n",
+    "Although Clarifai doesn't support streaming, you can still pass `stream=True` and receive the response in liteLLM's standard StreamResponse format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ModelResponse(id='chatcmpl-40ae19af-3bf0-4eb4-99f2-33aec3ba84af', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"In the quiet corners of time's grand hall,\\nLies the tale of rise and fall.\\nFrom ancient ruins to modern sprawl,\\nHistory, the greatest story of them all.\\n\\nEmpires have risen, empires have decayed,\\nThrough the eons, memories have stayed.\\nIn the book of time, history is laid,\\nA tapestry of events, meticulously displayed.\\n\\nThe pyramids of Egypt, standing tall,\\nThe Roman Empire's mighty sprawl.\\nFrom Alexander's conquest, to the Berlin Wall,\\nHistory, a silent witness to it all.\\n\\nIn the shadow of the past we tread,\\nWhere once kings and prophets led.\\nTheir stories in our hearts are spread,\\nEchoes of their words, in our minds are read.\\n\\nBattles fought and victories won,\\nActs of courage under the sun.\\nTales of love, of deeds done,\\nIn history's grand book, they all run.\\n\\nHeroes born, legends made,\\nIn the annals of time, they'll never fade.\\nTheir triumphs and failures all displayed,\\nIn the eternal march of history's parade.\\n\\nThe ink of the past is forever dry,\\nBut its lessons, we cannot deny.\\nIn its stories, truths lie,\\nIn its wisdom, we rely.\\n\\nHistory, a mirror to our past,\\nA guide for the future vast.\\nThrough its lens, we're ever cast,\\nIn the drama of life, forever vast.\", role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1714744515, model='https://api.clarifai.com/v2/users/openai/apps/chat-completion/models/GPT-4/outputs', object='chat.completion.chunk', system_fingerprint=None)\n",
+      "ModelResponse(id='chatcmpl-40ae19af-3bf0-4eb4-99f2-33aec3ba84af', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1714744515, model='https://api.clarifai.com/v2/users/openai/apps/chat-completion/models/GPT-4/outputs', object='chat.completion.chunk', system_fingerprint=None)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from litellm import completion\n",
+    "\n",
+    "messages = [{\"role\": \"user\",\"content\": \"\"\"Write a poem about history?\"\"\"}]\n",
+    "response = completion(\n",
+    "                model=\"clarifai/openai.chat-completion.GPT-4\",\n",
+    "                messages=messages,\n",
+    "                stream=True,\n",
+    "                api_key=os.environ[\"CLARIFAI_API_KEY\"])\n",
+    "\n",
+    "for chunk in response:\n",
+    "  print(chunk)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/deploy/azure_resource_manager/azure_marketplace.zip
+++ b/deploy/azure_resource_manager/azure_marketplace.zip
--- a/deploy/azure_resource_manager/azure_marketplace/createUiDefinition.json
+++ b/deploy/azure_resource_manager/azure_marketplace/createUiDefinition.json
@ -0,0 +1,15 @@
+{
+    "$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#",
+    "handler": "Microsoft.Azure.CreateUIDef",
+    "version": "0.1.2-preview",
+    "parameters": {
+        "config": {
+            "isWizard": false,
+            "basics": { }
+        },
+        "basics": [ ],
+        "steps": [ ],
+        "outputs": { },
+        "resourceTypes": [ ]
+    }
+}
--- a/deploy/azure_resource_manager/azure_marketplace/mainTemplate.json
+++ b/deploy/azure_resource_manager/azure_marketplace/mainTemplate.json
@ -0,0 +1,63 @@
+{
+    "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+    "contentVersion": "1.0.0.0",
+    "parameters": {
+      "imageName": {
+        "type": "string",
+        "defaultValue": "ghcr.io/berriai/litellm:main-latest"
+      },
+      "containerName": {
+        "type": "string",
+        "defaultValue": "litellm-container"
+      },
+      "dnsLabelName": {
+        "type": "string",
+        "defaultValue": "litellm"
+      },
+      "portNumber": {
+        "type": "int",
+        "defaultValue": 4000
+      }
+    },
+    "resources": [
+      {
+        "type": "Microsoft.ContainerInstance/containerGroups",
+        "apiVersion": "2021-03-01",
+        "name": "[parameters('containerName')]",
+        "location": "[resourceGroup().location]",
+        "properties": {
+          "containers": [
+            {
+              "name": "[parameters('containerName')]",
+              "properties": {
+                "image": "[parameters('imageName')]",
+                "resources": {
+                  "requests": {
+                    "cpu": 1,
+                    "memoryInGB": 2
+                  }
+                },
+                "ports": [
+                  {
+                    "port": "[parameters('portNumber')]"
+                  }
+                ]
+              }
+            }
+          ],
+          "osType": "Linux",
+          "restartPolicy": "Always",
+          "ipAddress": {
+            "type": "Public",
+            "ports": [
+              {
+                "protocol": "tcp",
+                "port": "[parameters('portNumber')]"
+              }
+            ],
+            "dnsNameLabel": "[parameters('dnsLabelName')]"
+          }
+        }
+      }
+    ]
+  }
--- a/deploy/azure_resource_manager/main.bicep
+++ b/deploy/azure_resource_manager/main.bicep
@ -0,0 +1,42 @@
+param imageName string = 'ghcr.io/berriai/litellm:main-latest'
+param containerName string = 'litellm-container'
+param dnsLabelName string = 'litellm'
+param portNumber int = 4000
+
+resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = {
+  name: containerName
+  location: resourceGroup().location
+  properties: {
+    containers: [
+      {
+        name: containerName
+        properties: {
+          image: imageName
+          resources: {
+            requests: {
+              cpu: 1
+              memoryInGB: 2
+            }
+          }
+          ports: [
+            {
+              port: portNumber
+            }
+          ]
+        }
+      }
+    ]
+    osType: 'Linux'
+    restartPolicy: 'Always'
+    ipAddress: {
+      type: 'Public'
+      ports: [
+        {
+          protocol: 'tcp'
+          port: portNumber
+        }
+      ]
+      dnsNameLabel: dnsLabelName
+    }
+  }
+}
--- a/deploy/charts/litellm-helm/Chart.yaml
+++ b/deploy/charts/litellm-helm/Chart.yaml
@ -24,7 +24,7 @@ version: 0.2.0
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: v1.24.5
+appVersion: v1.35.38

 dependencies:
  - name: "postgresql"
--- a/docs/my-website/docs/caching/redis_cache.md
+++ b/docs/my-website/docs/caching/redis_cache.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Caching - In-Memory, Redis, s3,  Redis Semantic Cache
+# Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk

 [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)

@ -11,7 +11,7 @@ Need to use Caching on LiteLLM Proxy Server? Doc here: [Caching Proxy Server](ht

 :::

-## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache
+## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic, Disk Cache


 <Tabs>
@ -159,7 +159,7 @@ litellm.cache = Cache()
 # Make completion calls
 response1 = completion(
    model="gpt-3.5-turbo", 
-    messages=[{"role": "user", "content": "Tell me a joke."}]
+    messages=[{"role": "user", "content": "Tell me a joke."}],
    caching=True
 )
 response2 = completion(
@ -174,6 +174,43 @@ response2 = completion(

 </TabItem>

+<TabItem value="disk" label="disk cache">
+
+### Quick Start
+
+Install diskcache:
+
+```shell
+pip install diskcache
+```
+
+Then you can use the disk cache as follows.
+
+```python
+import litellm
+from litellm import completion
+from litellm.caching import Cache
+litellm.cache = Cache(type="disk")
+
+# Make completion calls
+response1 = completion(
+    model="gpt-3.5-turbo", 
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    caching=True
+)
+response2 = completion(
+    model="gpt-3.5-turbo", 
+    messages=[{"role": "user", "content": "Tell me a joke."}],
+    caching=True
+)
+
+# response1 == response2, response 1 is cached
+
+```
+
+If you run this code twice, `response1` in the second run is served from the cache file that was written to disk during the first run.
+
+</TabItem>

 </Tabs>

@ -191,13 +228,13 @@ Advanced Params

 ```python
 litellm.enable_cache(
-    type: Optional[Literal["local", "redis"]] = "local",
+    type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
    supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"],
+        List[Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]]
+    ] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
    **kwargs,
 )
 ```
@ -215,13 +252,13 @@ Update the Cache params

 ```python
 litellm.update_cache(
-    type: Optional[Literal["local", "redis"]] = "local",
+    type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
    supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"],
+        List[Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]]
+    ] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
    **kwargs,
 )
 ```
@ -276,22 +313,29 @@ cache.get_cache = get_cache
 ```python
 def __init__(
    self,
-    type: Optional[Literal["local", "redis", "s3"]] = "local",
+    type: Optional[Literal["local", "redis", "redis-semantic", "s3", "disk"]] = "local",
    supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types.
+        List[Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]]
+    ] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
+    ttl: Optional[float] = None,
+    default_in_memory_ttl: Optional[float] = None,

    # redis cache params
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
-
+    namespace: Optional[str] = None,
+    default_in_redis_ttl: Optional[float] = None,
+    similarity_threshold: Optional[float] = None,
+    redis_semantic_cache_use_async=False,
+    redis_semantic_cache_embedding_model="text-embedding-ada-002",
+    redis_flush_size=None,

    # s3 Bucket, boto3 configuration
    s3_bucket_name: Optional[str] = None,
    s3_region_name: Optional[str] = None,
    s3_api_version: Optional[str] = None,
-    s3_path: Optional[str] = None, # if you wish to save to a spefic path
+    s3_path: Optional[str] = None, # if you wish to save to a specific path
    s3_use_ssl: Optional[bool] = True,
    s3_verify: Optional[Union[bool, str]] = None,
    s3_endpoint_url: Optional[str] = None,
@ -299,7 +343,11 @@ def __init__(
    s3_aws_secret_access_key: Optional[str] = None,
    s3_aws_session_token: Optional[str] = None,
    s3_config: Optional[Any] = None,
-    **kwargs,
+
+    # disk cache params
+    disk_cache_dir=None,
+
+    **kwargs
 ):
 ```

--- a/docs/my-website/docs/caching/local_caching.md
+++ b/docs/my-website/docs/caching/local_caching.md
@ -40,7 +40,7 @@ cache = Cache()

 cache.add_cache(cache_key="test-key", result="1234")

-cache.get_cache(cache_key="test-key)
+cache.get_cache(cache_key="test-key")
 ```

 ## Caching with Streaming 
--- a/docs/my-website/docs/completion/batching.md
+++ b/docs/my-website/docs/completion/batching.md
@ -4,6 +4,12 @@ LiteLLM allows you to:
 * Send 1 completion call to many models: Return Fastest Response
 * Send 1 completion call to many models: Return All Responses

+:::info
+
+Trying to do batch completion on the LiteLLM Proxy? Go here: https://docs.litellm.ai/docs/proxy/user_keys#beta-batch-completions---pass-model-as-list
+
+:::
+
 ## Send multiple completion calls to 1 model

 In the batch_completion method, you provide a list of `messages` where each sub-list of messages is passed to `litellm.completion()`, allowing you to process multiple prompts efficiently in a single API call.
--- a/docs/my-website/docs/completion/input.md
+++ b/docs/my-website/docs/completion/input.md
@ -37,11 +37,12 @@ print(response) # ["max_tokens", "tools", "tool_choice", "stream"]

 This is a list of openai params we translate across providers.

-This list is constantly being updated.
+Use `litellm.get_supported_openai_params()` for an updated list of params for each model + provider 

 | Provider | temperature | max_tokens | top_p | stream | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers | 
 |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--|
-|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ |  |  |   |  |   |
+|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ |  |  |   |  |  |  |  |  |  | ✅ | ✅ | 
+|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ |  |  |   |  |  |  |  | ✅ | ✅ | ✅ | ✅ | 
 |OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
 |Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ |  |  | ✅ |
 |Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | |  |   |  |   |
@ -83,8 +84,9 @@ def completion(
    top_p: Optional[float] = None,
    n: Optional[int] = None,
    stream: Optional[bool] = None,
+    stream_options: Optional[dict] = None,
    stop=None,
-    max_tokens: Optional[float] = None,
+    max_tokens: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[dict] = None,
@ -139,6 +141,10 @@ def completion(

 - `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.

+- `stream_options`: *dict or null (optional)* - Options for the streaming response. Only set this when `stream` is set to `true`.
+
+    - `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value. 
+
 - `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.

 - `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.
--- a/docs/my-website/docs/completion/token_usage.md
+++ b/docs/my-website/docs/completion/token_usage.md
@ -1,7 +1,7 @@
 # Completion Token Usage & Cost
 By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/))

-However, we also expose 5 helper functions + **[NEW]** an API to calculate token usage across providers:
+However, we also expose some helper functions + **[NEW]** an API to calculate token usage across providers:

 - `encode`: This encodes the text passed in, using the model-specific tokenizer. [**Jump to code**](#1-encode)

@ -9,17 +9,19 @@ However, we also expose 5 helper functions + **[NEW]** an API to calculate token

 - `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available. [**Jump to code**](#3-token_counter)

- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#4-cost_per_token)
+- `create_pretrained_tokenizer` and `create_tokenizer`: LiteLLM provides default tokenizer support for OpenAI, Cohere, Anthropic, Llama2, and Llama3 models. If you are using a different model, you can create a custom tokenizer and pass it as `custom_tokenizer` to the `encode`, `decode`, and `token_counter` methods. [**Jump to code**](#4-create_pretrained_tokenizer-and-create_tokenizer)

- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#5-completion_cost)
+- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#5-cost_per_token)

- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#6-get_max_tokens)
+- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#6-completion_cost)

- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#7-model_cost)
+- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#7-get_max_tokens)

- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#8-register_model)
+- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#8-model_cost)

- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#9-apilitellmai)
+- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#9-register_model)
+
+- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#10-apilitellmai)

 📣 This is a community maintained list. Contributions are welcome! ❤️

@ -60,7 +62,24 @@ messages = [{"user": "role", "content": "Hey, how's it going"}]
 print(token_counter(model="gpt-3.5-turbo", messages=messages))
 ```

-### 4. `cost_per_token`
+### 4. `create_pretrained_tokenizer` and `create_tokenizer`
+
+```python
+import json
+from litellm import create_pretrained_tokenizer, create_tokenizer
+# get tokenizer from huggingface repo
+custom_tokenizer_1 = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
+
+# use tokenizer from json file
+with open("tokenizer.json") as f:
+    json_data = json.load(f)
+
+json_str = json.dumps(json_data)
+
+custom_tokenizer_2 = create_tokenizer(json_str)
+```
+
+### 5. `cost_per_token`

 ```python
 from litellm import cost_per_token
@ -72,7 +91,7 @@ prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_toke
 print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
 ```

-### 5. `completion_cost`
+### 6. `completion_cost`

 * Input: Accepts a `litellm.completion()` response **OR** prompt + completion strings
 * Output: Returns a `float` of cost for the `completion` call 
@ -99,7 +118,7 @@ cost = completion_cost(model="bedrock/anthropic.claude-v2", prompt="Hey!", compl
 formatted_string = f"${float(cost):.10f}"
 print(formatted_string)
 ```
-### 6. `get_max_tokens`
+### 7. `get_max_tokens`

 Input: Accepts a model name - e.g., gpt-3.5-turbo (to get a complete list, call litellm.model_list).
 Output: Returns the maximum number of tokens allowed for the given model
@ -112,7 +131,7 @@ model = "gpt-3.5-turbo"
 print(get_max_tokens(model)) # Output: 4097
 ```

-### 7. `model_cost`
+### 8. `model_cost`

 * Output: Returns a dict object containing the max_tokens, input_cost_per_token, output_cost_per_token for all models on [community-maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)

@ -122,7 +141,7 @@ from litellm import model_cost
 print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token': 1.5e-06, 'output_cost_per_token': 2e-06}, ...}
 ```

-### 8. `register_model`
+### 9. `register_model`

 * Input: Provide EITHER a model cost dictionary or a url to a hosted json blob
 * Output: Returns updated model_cost dictionary + updates litellm.model_cost with model details.  
@ -157,5 +176,3 @@ export LITELLM_LOCAL_MODEL_COST_MAP="True"
 ```

 Note: this means you will need to upgrade to get updated pricing, and newer models. 
-
-
--- a/docs/my-website/docs/completion/vision.md
+++ b/docs/my-website/docs/completion/vision.md
@ -0,0 +1,45 @@
+# Using Vision Models
+
+## Quick Start
+Example passing images to a model 
+
+```python
+import os 
+from litellm import completion
+
+os.environ["OPENAI_API_KEY"] = "your-api-key"
+
+# openai call
+response = completion(
+    model = "gpt-4-vision-preview", 
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                            {
+                                "type": "text",
+                                "text": "What’s in this image?"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                                }
+                            }
+                        ]
+        }
+    ],
+)
+
+```
+
+## Checking if a model supports `vision`
+
+Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vision` and `False` if not
+
+```python
+assert litellm.supports_vision(model="gpt-4-vision-preview") == True
+assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True
+assert litellm.supports_vision(model="gpt-3.5-turbo") == False
+```
+
--- a/docs/my-website/docs/debugging/local_debugging.md
+++ b/docs/my-website/docs/debugging/local_debugging.md
@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
 response = completion("command-nightly", messages)
 ```

+## JSON Logs 
+
+If you need to store the logs as JSON, set `litellm.json_logs = True`.
+
+We currently just log the raw POST request from litellm as a JSON - [**See Code**]. 
+
+[Share feedback here](https://github.com/BerriAI/litellm/issues)
+
 ## Logger Function 
 But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set? 

--- a/docs/my-website/docs/embedding/supported_embedding.md
+++ b/docs/my-website/docs/embedding/supported_embedding.md
@ -320,8 +320,6 @@ from litellm import embedding
 litellm.vertex_project = "hardy-device-38811" # Your Project ID
 litellm.vertex_location = "us-central1"  # proj location

-
-os.environ['VOYAGE_API_KEY'] = ""
 response = embedding(
    model="vertex_ai/textembedding-gecko",
    input=["good morning from litellm"],
@ -339,6 +337,8 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02
 | textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` | 
 | textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` | 
 | textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` | 
+| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
+| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` | 

 ## Voyage AI Embedding Models

--- a/docs/my-website/docs/enterprise.md
+++ b/docs/my-website/docs/enterprise.md
@ -8,14 +8,23 @@ For companies that need SSO, user management and professional support for LiteLL
 :::

 This covers: 
- ✅ **Features under the [LiteLLM Commercial License](https://docs.litellm.ai/docs/proxy/enterprise):**
+- ✅ **Features under the [LiteLLM Commercial License (Content Mod, Custom Tags, etc.)](https://docs.litellm.ai/docs/proxy/enterprise)**
 - ✅ **Feature Prioritization**
 - ✅ **Custom Integrations**
 - ✅ **Professional Support - Dedicated discord + slack**
 - ✅ **Custom SLAs**
- ✅ **Secure access with Single Sign-On**
+- ✅ [**Secure UI access with Single Sign-On**](../docs/proxy/ui.md#setup-ssoauth-for-ui)
+- ✅ [**JWT-Auth**](../docs/proxy/token_auth.md)


+## [COMING SOON] AWS Marketplace Support
+
+Deploy managed LiteLLM Proxy within your VPC.
+
+Includes all enterprise features.
+
+[**Get early access**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
 ## Frequently Asked Questions

 ### What topics does Professional support cover and what SLAs do you offer?
--- a/docs/my-website/docs/exception_mapping.md
+++ b/docs/my-website/docs/exception_mapping.md
@ -13,7 +13,7 @@ LiteLLM maps exceptions across all providers to their OpenAI counterparts.
 | >=500       | InternalServerError      |
 | N/A         | ContextWindowExceededError|
 | 400         | ContentPolicyViolationError|
-| N/A         | APIConnectionError       |
+| 500         | APIConnectionError       |


 Base case we return APIConnectionError
@ -74,6 +74,28 @@ except Exception as e:

 ```

+## Usage - Should you retry exception? 
+
+```python
+import litellm
+import openai
+
+try:
+    response = litellm.completion(
+                model="gpt-4",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": "hello, write a 20 page essay"
+                    }
+                ],
+                timeout=0.01, # this will raise a timeout exception
+            )
+except openai.APITimeoutError as e:
+    should_retry = litellm._should_retry(e.status_code)
+    print(f"should_retry: {should_retry}")
+```
+
 ## Details 

 To see how it's implemented - [check out the code](https://github.com/BerriAI/litellm/blob/a42c197e5a6de56ea576c73715e6c7c6b19fa249/litellm/utils.py#L1217)
@ -84,23 +106,37 @@ To see how it's implemented - [check out the code](https://github.com/BerriAI/li

 ## Custom mapping list

-Base case - we return the original exception.
+Base case - we return `litellm.APIConnectionError` exception (inherits from openai's APIConnectionError exception).

-|               | ContextWindowExceededError | AuthenticationError | InvalidRequestError | RateLimitError | ServiceUnavailableError |
-|---------------|----------------------------|---------------------|---------------------|---------------|-------------------------|
-| Anthropic     | ✅                          | ✅                   | ✅                   | ✅             |                         |
-| OpenAI        | ✅                          | ✅                     |✅                     |✅               |✅|
-| Azure OpenAI        | ✅                          | ✅                     |✅                     |✅               |✅|
-| Replicate     | ✅                          | ✅                   | ✅                   | ✅             | ✅                       |
-| Cohere        | ✅                          | ✅                   | ✅                    | ✅             | ✅                        |
-| Huggingface   | ✅                          | ✅                   | ✅                   | ✅             |                         |
-| Openrouter    | ✅                          | ✅                   | ✅                    | ✅             |                         |
-| AI21          | ✅                          | ✅                   | ✅                   | ✅             |                         |
-| VertexAI          |                           |                   |✅                   |             |                         |
-| Bedrock          |                           |                   |✅                   |             |                         |
-| Sagemaker          |                           |                   |✅                   |             |                         |
-| TogetherAI    | ✅                          | ✅                   | ✅                   | ✅             |                         |
-| AlephAlpha    | ✅                          | ✅                   | ✅                   | ✅             | ✅                        |
+| custom_llm_provider        | Timeout | ContextWindowExceededError | BadRequestError | NotFoundError | ContentPolicyViolationError | AuthenticationError | APIError | RateLimitError | ServiceUnavailableError | PermissionDeniedError | UnprocessableEntityError |
+|----------------------------|---------|----------------------------|------------------|---------------|-----------------------------|---------------------|----------|----------------|-------------------------|-----------------------|-------------------------|
+| openai                     | ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
+| watsonx                     |       | | | | | | |✓| | | |
+| text-completion-openai     | ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
+| custom_openai              | ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
+| openai_compatible_providers| ✓       | ✓                          | ✓                |               | ✓                           | ✓                   |          |                |                         |                       |                           |
+| anthropic                  | ✓       | ✓                          | ✓                | ✓             |                             | ✓                   |          |                | ✓                       | ✓                     |                           |
+| replicate                  | ✓       | ✓                          | ✓                | ✓             |                             | ✓                   |          | ✓              | ✓                       |                       |                           |
+| bedrock                    | ✓       | ✓                          | ✓                | ✓             |                             | ✓                   |          | ✓              | ✓                       | ✓                     |                           |
+| sagemaker                  |         | ✓                          | ✓                |               |                             |                     |          |                |                         |                       |                           |
+| vertex_ai                  | ✓       |                            | ✓                |               |                             |                     | ✓        |                |                         |                       | ✓                         |
+| palm                       | ✓       | ✓                          |                  |               |                             |                     | ✓        |                |                         |                       |                           |
+| gemini                     | ✓       | ✓                          |                  |               |                             |                     | ✓        |                |                         |                       |                           |
+| cloudflare                 |         |                            | ✓                |               |                             | ✓                   |          |                |                         |                       |                           |
+| cohere                     |         | ✓                          | ✓                |               |                             | ✓                   |          |                | ✓                       |                       |                           |
+| cohere_chat                |         | ✓                          | ✓                |               |                             | ✓                   |          |                | ✓                       |                       |                           |
+| huggingface                | ✓       | ✓                          | ✓                |               |                             | ✓                   |          | ✓              | ✓                       |                       |                           |
+| ai21                       | ✓       | ✓                          | ✓                | ✓             |                             | ✓                   |          | ✓              |                         |                       |                           |
+| nlp_cloud                  | ✓       | ✓                          | ✓                |               |                             | ✓                   | ✓        | ✓              | ✓                       |                       |                           |
+| together_ai                | ✓       | ✓                          | ✓                |               |                             | ✓                   |          |                |                         |                       |                           |
+| aleph_alpha                |         |                            | ✓                |               |                             | ✓                   |          |                |                         |                       |                           |
+| ollama                     | ✓       |                            | ✓                |               |                             |                     |          |                | ✓                       |                       |                           |
+| ollama_chat                | ✓       |                            | ✓                |               |                             |                     |          |                | ✓                       |                       |                           |
+| vllm                       |         |                            |                  |               |                             | ✓                   | ✓        |                |                         |                       |                           |
+| azure                      | ✓       | ✓                          | ✓                | ✓             | ✓                           | ✓                   |          |                | ✓                       |                       |                           |
+
+- "✓" indicates that the specified `custom_llm_provider` can raise the corresponding exception.
+- Empty cells indicate that LiteLLM's exception-mapping function does not map that provider to the corresponding exception type.


 > For a deeper understanding of these exceptions, you can check out [this](https://github.com/BerriAI/litellm/blob/d7e58d13bf9ba9edbab2ab2f096f3de7547f35fa/litellm/utils.py#L1544) implementation for additional insights.
--- a/docs/my-website/docs/hosted.md
+++ b/docs/my-website/docs/hosted.md
@ -0,0 +1,58 @@
+import Image from '@theme/IdealImage';
+
+# Hosted LiteLLM Proxy
+
+LiteLLM maintains the proxy, so you can focus on your core products. 
+
+## [**Get Onboarded**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+This is in alpha. Schedule a call with us, and we'll give you a hosted proxy within 30 minutes. 
+
+[**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+### **Status**: Alpha 
+
+Our proxy is already used in production by customers. 
+
+See our status page for [**live reliability**](https://status.litellm.ai/)
+
+### **Benefits**
+- **No Maintenance, No Infra**: We'll maintain the proxy, and spin up any additional infrastructure (e.g.: separate server for spend logs) to make sure you can load balance + track spend across multiple LLM projects. 
+- **Reliable**: Our hosted proxy is tested on 1k requests per second, making it reliable for high load.
+- **Secure**: LiteLLM is currently undergoing SOC-2 compliance, to make sure your data is as secure as possible.
+
+### Pricing
+
+Pricing is based on usage. We can figure out a price that works for your team, on the call. 
+
+[**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+## **Screenshots**
+
+### 1. Create keys
+
+<Image img={require('../img/litellm_hosted_ui_create_key.png')} />
+
+### 2. Add Models
+
+<Image img={require('../img/litellm_hosted_ui_add_models.png')}/>
+
+### 3. Track spend 
+
+<Image img={require('../img/litellm_hosted_usage_dashboard.png')} />
+
+
+### 4. Configure load balancing 
+
+<Image img={require('../img/litellm_hosted_ui_router.png')} />
+
+#### [**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+## Feature List 
+
+- Easy way to add/remove models
+- 100% uptime even when models are added/removed
+- custom callback webhooks
+- your domain name with HTTPS
+- Ability to create/delete User API keys
+- Reasonable set monthly cost
--- a/docs/my-website/docs/langchain/langchain.md
+++ b/docs/my-website/docs/langchain/langchain.md
@ -14,14 +14,14 @@ import TabItem from '@theme/TabItem';

 ```python
 import os
-from langchain.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_community.chat_models import ChatLiteLLM
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
 )
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

 os.environ['OPENAI_API_KEY'] = ""
 chat = ChatLiteLLM(model="gpt-3.5-turbo")
@ -30,7 +30,7 @@ messages = [
        content="what model are you"
    )
 ]
-chat(messages)
+chat.invoke(messages)
 ```

 </TabItem>
@ -39,14 +39,14 @@ chat(messages)

 ```python
 import os
-from langchain.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_community.chat_models import ChatLiteLLM
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
 )
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

 os.environ['ANTHROPIC_API_KEY'] = ""
 chat = ChatLiteLLM(model="claude-2", temperature=0.3)
@ -55,7 +55,7 @@ messages = [
        content="what model are you"
    )
 ]
-chat(messages)
+chat.invoke(messages)
 ```

 </TabItem>
@ -64,14 +64,14 @@ chat(messages)

 ```python
 import os
-from langchain.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_community.chat_models import ChatLiteLLM
+from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
 )
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

 os.environ['REPLICATE_API_TOKEN'] = ""
 chat = ChatLiteLLM(model="replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1")
@ -80,7 +80,7 @@ messages = [
        content="what model are you?"
    )
 ]
-chat(messages)
+chat.invoke(messages)
 ```

 </TabItem>
@ -89,14 +89,14 @@ chat(messages)

 ```python
 import os
-from langchain.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_community.chat_models import ChatLiteLLM
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
 )
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

 os.environ['COHERE_API_KEY'] = ""
 chat = ChatLiteLLM(model="command-nightly")
@ -105,32 +105,9 @@ messages = [
        content="what model are you?"
    )
 ]
-chat(messages)
+chat.invoke(messages)
 ```

-</TabItem>
-<TabItem value="palm" label="PaLM - Google">
-
-```python
-import os
-from langchain.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
-    ChatPromptTemplate,
-    SystemMessagePromptTemplate,
-    AIMessagePromptTemplate,
-    HumanMessagePromptTemplate,
-)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
-
-os.environ['PALM_API_KEY'] = ""
-chat = ChatLiteLLM(model="palm/chat-bison")
-messages = [
-    HumanMessage(
-        content="what model are you?"
-    )
-]
-chat(messages)
-```
 </TabItem>
 </Tabs>

--- a/docs/my-website/docs/load_test.md
+++ b/docs/my-website/docs/load_test.md
@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())

 ```

+## Multi-Instance TPM/RPM Load Test (Router)
+
+Test if your defined tpm/rpm limits are respected across multiple instances of the Router object. 
+
+In our test:
+- Max RPM per deployment = 100 requests per minute
+- Max Throughput / min on router = 200 requests per minute (2 deployments)
+- Load we'll send through router = 600 requests per minute
+
+:::info
+
+If you don't want to call a real LLM API endpoint, you can set up a fake OpenAI server. [See code](#extra---setup-fake-openai-server)
+
+:::
+
+### Code 
+
+Let's hit the router with 600 requests per minute. 
+
+Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
+
+
+```python
+from litellm import Router 
+import litellm
+litellm.suppress_debug_info = True
+litellm.set_verbose = False
+import logging
+logging.basicConfig(level=logging.CRITICAL)
+import os, random, uuid, time, asyncio
+
+# Two deployments registered under the same model_name ("fake-openai-endpoint").
+# Each points at a local fake OpenAI server (ports 8080 / 8081) and sets rpm=100.
+model_list = [
+    {
+        "model_name": "fake-openai-endpoint",
+        "litellm_params": {
+            "model": "gpt-3.5-turbo",
+            "api_key": "my-fake-key",
+            "api_base": "http://0.0.0.0:8080",
+            "rpm": 100
+        },
+    },
+    {
+        "model_name": "fake-openai-endpoint",
+        "litellm_params": {
+            "model": "gpt-3.5-turbo",
+            "api_key": "my-fake-key",
+            "api_base": "http://0.0.0.0:8081",
+            "rpm": 100
+        },
+    },
+]
+
+# Two separate Router objects built from the same model_list and the same Redis
+# settings; they stand in for the "multiple instances" this load test exercises.
+router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+
+
+
+async def router_completion_non_streaming():
+  """Send one chat completion through a randomly chosen router instance.
+
+  Returns the response on success. Any exception is swallowed on purpose and
+  reported as None, so the caller can count how many requests succeeded.
+  """
+  client: Router = random.choice([router_1, router_2])  # randomly pick b/w clients
+  try:
+    return await client.acompletion(
+        model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
+        messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+    )
+  except Exception:
+    # Best-effort by design: a failed call counts as "not successful".
+    return None
+  
+async def loadtest_fn():
+    """Fire one batch of concurrent requests and print a one-line summary.
+
+    Printed fields: request count, elapsed seconds, number of non-None responses.
+    """
+    started_at = time.time()
+    n = 600  # Number of concurrent tasks
+    results = await asyncio.gather(
+        *(router_completion_non_streaming() for _ in range(n))
+    )
+    succeeded = [result for result in results if result is not None]
+    print(n, time.time() - started_at, len(succeeded))
+
+def get_utc_datetime():
+    """Return the current time as a timezone-aware UTC datetime.
+
+    `datetime.now(timezone.utc)` is available on every Python 3 release, so no
+    version check is needed. It also avoids `datetime.utcnow()`, which returns
+    a naive datetime and is deprecated since Python 3.12.
+    """
+    from datetime import datetime, timezone
+
+    return datetime.now(timezone.utc)
+
+
+# Run the event loop to execute the async function
+async def parent_fn():
+  """Run 10 load-test batches, sleeping 10 seconds after each one."""
+  batches_left = 10
+  while batches_left > 0:
+    batches_left -= 1
+    minute_label = get_utc_datetime().strftime("%H-%M")
+    print(f"triggered new batch - {minute_label}")
+    await loadtest_fn()
+    await asyncio.sleep(10)
+
+asyncio.run(parent_fn())
+```
+## Multi-Instance TPM/RPM Load Test (Proxy)
+
+Test if your defined tpm/rpm limits are respected across multiple instances. 
+
+The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you. 
+
+In our test:
+- Max RPM per deployment = 100 requests per minute
+- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
+- Load we'll send to proxy = 600 requests per minute
+
+
+So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
+
+:::info
+
+If you don't want to call a real LLM API endpoint, you can set up a fake OpenAI server. [See code](#extra---setup-fake-openai-server)
+
+:::
+
+### 1. Setup config 
+
+```yaml
+# Two deployments registered under the same model_name; each sets rpm: 100.
+model_list:
+- litellm_params:
+    api_base: http://0.0.0.0:8080
+    api_key: my-fake-key
+    model: openai/my-fake-model
+    rpm: 100
+  model_name: fake-openai-endpoint
+- litellm_params:
+    api_base: http://0.0.0.0:8081
+    api_key: my-fake-key
+    model: openai/my-fake-model-2
+    rpm: 100
+  model_name: fake-openai-endpoint
+# Router options; the redis_* values are read from environment variables.
+router_settings:
+  num_retries: 0
+  enable_pre_call_checks: true
+  redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
+  redis_password: os.environ/REDIS_PASSWORD
+  redis_port: os.environ/REDIS_PORT
+  routing_strategy: usage-based-routing-v2
+```
+
+### 2. Start 2 proxy instances
+
+**Instance 1**
+```bash
+litellm --config /path/to/config.yaml --port 4000
+
+## RUNNING on http://0.0.0.0:4000
+```
+
+**Instance 2**
+```bash
+litellm --config /path/to/config.yaml --port 4001
+
+## RUNNING on http://0.0.0.0:4001
+```
+
+### 3. Run Test 
+
+Let's hit the proxy with 600 requests per minute. 
+
+Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
+
+```python
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import random, uuid
+import time, asyncio, litellm
+# import logging
+# logging.basicConfig(level=logging.DEBUG)
+#### LITELLM PROXY #### 
+litellm_client = AsyncOpenAI(
+    api_key="sk-1234", # [CHANGE THIS]
+    base_url="http://0.0.0.0:4000"
+)
+litellm_client_2 = AsyncOpenAI(
+    api_key="sk-1234", # [CHANGE THIS]
+    base_url="http://0.0.0.0:4001"
+)
+
+async def proxy_completion_non_streaming():
+  """Send one chat completion to a randomly chosen proxy instance.
+
+  Returns the response on success. Any exception is swallowed on purpose and
+  reported as None, so the caller can count how many requests succeeded.
+  """
+  client = random.choice([litellm_client, litellm_client_2])  # randomly pick b/w clients
+  try:
+    return await client.chat.completions.create(
+        model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
+        messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+    )
+  except Exception:
+    # Best-effort by design: a failed call counts as "not successful".
+    return None
+  
+async def loadtest_fn():
+    """Send one batch of concurrent proxy requests and print a summary line.
+
+    Printed fields: request count, elapsed seconds, number of non-None responses.
+    """
+    t0 = time.time()
+    n = 600  # Number of concurrent tasks
+    pending = []
+    for _ in range(n):
+        pending.append(proxy_completion_non_streaming())
+    outcomes = await asyncio.gather(*pending)
+    ok_count = sum(1 for outcome in outcomes if outcome is not None)
+    print(n, time.time() - t0, ok_count)
+
+def get_utc_datetime():
+    """Return the current time as a timezone-aware UTC datetime.
+
+    `datetime.now(timezone.utc)` is available on every Python 3 release, so no
+    version check is needed. It also avoids `datetime.utcnow()`, which returns
+    a naive datetime and is deprecated since Python 3.12.
+    """
+    from datetime import datetime, timezone
+
+    return datetime.now(timezone.utc)
+
+
+# Run the event loop to execute the async function
+async def parent_fn():
+  """Run 10 load-test batches, sleeping 10 seconds after each one."""
+  for _batch in range(10):
+    stamp = get_utc_datetime().strftime("%H-%M")
+    print(f"triggered new batch - {stamp}")
+    await loadtest_fn()
+    await asyncio.sleep(10)
+
+asyncio.run(parent_fn())
+
+```
+
+
+### Extra - Setup Fake OpenAI Server 
+
+Let's set up a fake OpenAI server with an RPM limit of 100.
+
+Let's call our file `fake_openai_server.py`. 
+
+```python
+# import sys, os
+# sys.path.insert(
+#     0, os.path.abspath("../")
+# )  # Adds the parent directory to the system path
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from fastapi import FastAPI, Request, HTTPException, UploadFile, File
+import httpx, os, json
+from openai import AsyncOpenAI
+from typing import Optional
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import PlainTextResponse
+
+
+class ProxyException(Exception):
+    """Exception that carries `message`, `type`, `param` and `code` fields.
+
+    NOTE(review): nothing else in this example script raises or catches it.
+    """
+    # NOTE: DO NOT MODIFY THIS
+    # This is used to map exactly to OPENAI Exceptions
+    def __init__(
+        self,
+        message: str,
+        type: str,
+        param: Optional[str],
+        code: Optional[int],
+    ):
+        # Stores the four fields as-is; Exception.__init__ is not called here.
+        self.message = message
+        self.type = type
+        self.param = param
+        self.code = code
+
+    def to_dict(self) -> dict:
+        """Converts the ProxyException instance to a dictionary."""
+        return {
+            "message": self.message,
+            "type": self.type,
+            "param": self.param,
+            "code": self.code,
+        }
+
+
+limiter = Limiter(key_func=get_remote_address)
+app = FastAPI()
+app.state.limiter = limiter
+
+@app.exception_handler(RateLimitExceeded)
+async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
+    """Turn slowapi's RateLimitExceeded into a 429 JSON response."""
+    # The decorator above already registers this handler on `app`, so a
+    # separate app.add_exception_handler(...) call would be redundant.
+    return JSONResponse(status_code=429,
+                        content={"detail": "Rate Limited!"})
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+@limiter.limit("100/minute")
+async def completion(request: Request):
+    """Return a canned chat-completion payload (subject to the `100/minute` slowapi limit)."""
+    # raise HTTPException(status_code=429, detail="Rate Limited!")
+    assistant_message = {
+        "role": "assistant",
+        "content": "\n\nHello there, how may I assist you today?",
+    }
+    only_choice = {
+        "index": 0,
+        "message": assistant_message,
+        "logprobs": None,
+        "finish_reason": "stop",
+    }
+    token_usage = {
+        "prompt_tokens": 9,
+        "completion_tokens": 12,
+        "total_tokens": 21,
+    }
+    return {
+        "id": "chatcmpl-123",
+        "object": "chat.completion",
+        "created": 1677652288,
+        "model": None,
+        "system_fingerprint": "fp_44709d6fcb",
+        "choices": [only_choice],
+        "usage": token_usage,
+    }
+
+if __name__ == "__main__":
+    import socket
+    import uvicorn
+
+    # Probe ports upward from 8080 and serve on the first one that nothing is
+    # listening on, so a second copy of this script picks the next port.
+    port = 8080
+    while True:
+        # `with` closes each probe socket instead of leaking one per attempt.
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            # connect_ex returns 0 when the connection succeeds, i.e. the port is taken.
+            port_in_use = sock.connect_ex(('0.0.0.0', port)) == 0
+        if not port_in_use:
+            print(f"Port {port} is available, starting server...")
+            break
+        port += 1
+
+    uvicorn.run(app, host="0.0.0.0", port=port)
+```
+
+```bash
+python3 fake_openai_server.py
+```
--- a/docs/my-website/docs/observability/callbacks.md
+++ b/docs/my-website/docs/observability/callbacks.md
@ -8,6 +8,7 @@ liteLLM supports:

 - [Custom Callback Functions](https://docs.litellm.ai/docs/observability/custom_callback)
 - [Lunary](https://lunary.ai/docs)
+- [Langfuse](https://langfuse.com/docs)
 - [Helicone](https://docs.helicone.ai/introduction)
 - [Traceloop](https://traceloop.com/docs)
 - [Athina](https://docs.athina.ai/)
@ -22,8 +23,8 @@ from litellm import completion

 # set callbacks
 litellm.input_callback=["sentry"] # for sentry breadcrumbing - logs the input being sent to the api
-litellm.success_callback=["posthog", "helicone", "lunary", "athina"]
-litellm.failure_callback=["sentry", "lunary"]
+litellm.success_callback=["posthog", "helicone", "langfuse", "lunary", "athina"]
+litellm.failure_callback=["sentry", "lunary", "langfuse"]

 ## set env variables
 os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= ""
@ -32,6 +33,9 @@ os.environ["HELICONE_API_KEY"] = ""
 os.environ["TRACELOOP_API_KEY"] = ""
 os.environ["LUNARY_PUBLIC_KEY"] = ""
 os.environ["ATHINA_API_KEY"] = ""
+os.environ["LANGFUSE_PUBLIC_KEY"] = ""
+os.environ["LANGFUSE_SECRET_KEY"] = ""
+os.environ["LANGFUSE_HOST"] = ""

 response = completion(model="gpt-3.5-turbo", messages=messages)
 ```
--- a/docs/my-website/docs/observability/custom_callback.md
+++ b/docs/my-website/docs/observability/custom_callback.md
@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
 ## Examples

 ### Custom Callback to track costs for Streaming + Non-Streaming
+By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
 ```python

+# Step 1. Write your custom callback function
 def track_cost_callback(
    kwargs,                 # kwargs to completion
    completion_response,    # response from completion
    start_time, end_time    # start/end time
 ):
    try:
-        # init logging config
-        logging.basicConfig(
-                filename='cost.log',
-                level=logging.INFO,
-                format='%(asctime)s - %(message)s',
-                datefmt='%Y-%m-%d %H:%M:%S'
-        )
-
-        # check if it has collected an entire stream response
-        if "complete_streaming_response" in kwargs:
-            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost 
-            completion_response=kwargs["complete_streaming_response"]
-            input_text = kwargs["messages"]
-            output_text = completion_response["choices"][0]["message"]["content"]
-            response_cost = litellm.completion_cost(
-                model = kwargs["model"],
-                messages = input_text,
-                completion=output_text
-            )
-            print("streaming response_cost", response_cost)
-            logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
-
-        # for non streaming responses
-        else:
-            # we pass the completion_response obj
-            if kwargs["stream"] != True:
-                response_cost = litellm.completion_cost(completion_response=completion_response)
-                print("regular response_cost", response_cost)
-                logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
+        response_cost = kwargs["response_cost"] # litellm calculates response cost for you
+        print("regular response_cost", response_cost)
    except:
        pass

-# Assign the custom callback function
+# Step 2. Assign the custom callback function
 litellm.success_callback = [track_cost_callback]

+# Step 3. Make litellm.completion call
 response = completion(
    model="gpt-3.5-turbo",
    messages=[
--- a/docs/my-website/docs/observability/greenscale_integration.md
+++ b/docs/my-website/docs/observability/greenscale_integration.md
@ -0,0 +1,68 @@
+# Greenscale - Track LLM Spend and Responsible Usage
+
+[Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII).
+
+## Getting Started
+
+Use Greenscale to log requests across all LLM Providers
+
+liteLLM provides `callbacks`, making it easy for you to log data depending on the status of your responses.
+
+## Using Callbacks
+
+First, email `hello@greenscale.ai` to get an API_KEY.
+
+Use just 1 line of code, to instantly log your responses **across all providers** with Greenscale:
+
+```python
+litellm.success_callback = ["greenscale"]
+```
+
+### Complete code
+
+```python
+import os
+
+import litellm
+from litellm import completion
+
+## set env variables
+os.environ['GREENSCALE_API_KEY'] = 'your-greenscale-api-key'
+os.environ['GREENSCALE_ENDPOINT'] = 'greenscale-endpoint'
+os.environ["OPENAI_API_KEY"]= ""
+
+# set callback
+litellm.success_callback = ["greenscale"]
+
+#openai call
+response = completion(
+  model="gpt-3.5-turbo",
+  messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
+  metadata={
+    "greenscale_project": "acme-project",
+    "greenscale_application": "acme-application"
+  }
+)
+```
+
+## Additional information in metadata
+
+You can send any additional information to Greenscale by using the `metadata` field in completion and the `greenscale_` prefix. This can be useful for sending metadata about the request, such as the project and application name, customer_id, environment, or any other information you want to use for tracking usage. `greenscale_project` and `greenscale_application` are required fields.
+
+```python
+#openai call with additional metadata
+response = completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ],
+  metadata={
+    "greenscale_project": "acme-project",
+    "greenscale_application": "acme-application",
+    "greenscale_customer_id": "customer-123"
+  }
+)
+```
+
+## Support & Talk with Greenscale Team
+
+- [Schedule Demo 👋](https://calendly.com/nandesh/greenscale)
+- [Website 💻](https://greenscale.ai)
+- Our email ✉️ `hello@greenscale.ai`
--- a/docs/my-website/docs/observability/lago.md
+++ b/docs/my-website/docs/observability/lago.md
@ -0,0 +1,173 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Lago - Usage Based Billing
+
+[Lago](https://www.getlago.com/) offers a self-hosted and cloud, metering and usage-based billing solution.
+
+<Image img={require('../../img/lago.jpeg')} />
+
+## Quick Start
+Use just 1 line of code to instantly log your responses **across all providers** with Lago
+
+Get your Lago [API Key](https://docs.getlago.com/guide/self-hosted/docker#find-your-api-key)
+
+```python
+litellm.callbacks = ["lago"] # logs cost + usage of successful calls to lago
+```
+
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+# pip install lago 
+import litellm
+import os
+
+os.environ["LAGO_API_BASE"] = "" # http://0.0.0.0:3000
+os.environ["LAGO_API_KEY"] = ""
+os.environ["LAGO_API_EVENT_CODE"] = "" # The billable metric's code - https://docs.getlago.com/guide/events/ingesting-usage#define-a-billable-metric
+
+# LLM API Keys
+os.environ['OPENAI_API_KEY']=""
+
+# set lago as a callback, litellm will send the data to lago
+litellm.success_callback = ["lago"] 
+ 
+# openai call
+response = litellm.completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ],
+  user="your_customer_id" # 👈 SET YOUR CUSTOMER ID HERE
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add to Config.yaml
+```yaml
+model_list:
+- litellm_params:
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_key: my-fake-key
+    model: openai/my-fake-model
+  model_name: fake-openai-endpoint
+
+litellm_settings:
+  callbacks: ["lago"] # 👈 KEY CHANGE
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+<Tabs>
+<TabItem value="curl" label="Curl">
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "fake-openai-endpoint",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ],
+      "user": "your-customer-id" # 👈 SET YOUR CUSTOMER ID
+    }
+'
+```
+</TabItem>
+<TabItem value="openai_python" label="OpenAI Python SDK">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
+    {
+        "role": "user",
+        "content": "this is a test request, write a short poem"
+    }
+], user="my_customer_id") # 👈 whatever your customer id is
+
+print(response)
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+import os 
+
+os.environ["OPENAI_API_KEY"] = "anything"
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000",
+    model = "gpt-3.5-turbo",
+    temperature=0.1,
+    extra_body={
+        "user": "my_customer_id"  # 👈 whatever your customer id is
+    }
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+</TabItem>
+</Tabs>
+
+
+<Image img={require('../../img/lago_2.png')} />
+
+## Advanced - Lago Logging object
+
+This is what LiteLLM will log to Lago:
+
+```
+{
+    "event": {
+      "transaction_id": "<generated_unique_id>",
+      "external_customer_id": <litellm_end_user_id>, # passed via `user` param in /chat/completion call - https://platform.openai.com/docs/api-reference/chat/create
+      "code": os.getenv("LAGO_API_EVENT_CODE"), 
+      "properties": {
+          "input_tokens": <number>,
+          "output_tokens": <number>,
+          "model": <string>,
+          "response_cost": <number>, # 👈 LITELLM CALCULATED RESPONSE COST - https://github.com/BerriAI/litellm/blob/d43f75150a65f91f60dc2c0c9462ce3ffc713c1f/litellm/utils.py#L1473
+      }
+    }
+}
+```
--- a/docs/my-website/docs/observability/langfuse_integration.md
+++ b/docs/my-website/docs/observability/langfuse_integration.md
@ -94,9 +94,10 @@ print(response)

 ```

-### Set Custom Trace ID, Trace User ID and Tags
+### Set Custom Trace ID, Trace User ID, Trace Metadata, Trace Version, Trace Release and Tags
+
+Pass `trace_id`, `trace_user_id`, `trace_metadata`, `trace_version`, `trace_release`, `tags` in `metadata`

-Pass `trace_id`, `trace_user_id` in `metadata`

 ```python
 import litellm
@ -121,10 +122,21 @@ response = completion(
  metadata={
      "generation_name": "ishaan-test-generation",  # set langfuse Generation Name
      "generation_id": "gen-id22",                  # set langfuse Generation ID 
-      "trace_id": "trace-id22",                     # set langfuse Trace ID
+      "version":  "test-generation-version",        # set langfuse Generation Version
      "trace_user_id": "user-id2",                  # set langfuse Trace User ID
      "session_id": "session-1",                    # set langfuse Session ID
-      "tags": ["tag1", "tag2"]                      # set langfuse Tags
+      "tags": ["tag1", "tag2"],                     # set langfuse Tags
+      "trace_id": "trace-id22",                     # set langfuse Trace ID
+      "trace_metadata": {"key": "value"},           # set langfuse Trace Metadata
+      "trace_version": "test-trace-version",        # set langfuse Trace Version (if not set, defaults to Generation Version)
+      "trace_release": "test-trace-release",        # set langfuse Trace Release
+      ### OR ### 
+      "existing_trace_id": "trace-id22",            # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
+      ### OR enforce that certain fields are overwritten in the trace during the continuation ###
+      "existing_trace_id": "trace-id22",
+      "trace_metadata": {"key": "updated_trace_value"},            # The new value to use for the langfuse Trace Metadata
+      "update_trace_keys": ["input", "output", "trace_metadata"],  # Updates the trace input & output to be this generations input & output also updates the Trace Metadata to match the passed in value
+      "debug_langfuse": True,                                      # Will log the exact metadata sent to litellm for the trace/generation as `metadata_passed_to_litellm` 
  },
 )

@ -132,6 +144,38 @@ print(response)

 ```

+### Trace & Generation Parameters
+
+#### Trace Specific Parameters
+
+* `trace_id`       - Identifier for the trace, must use `existing_trace_id` instead of `trace_id` if this is an existing trace, auto-generated by default
+* `trace_name`     - Name of the trace, auto-generated by default
+* `session_id`     - Session identifier for the trace, defaults to `None`
+* `trace_version`  - Version for the trace, defaults to value for `version`
+* `trace_release`  - Release for the trace, defaults to `None`
+* `trace_metadata` - Metadata for the trace, defaults to `None`
+* `trace_user_id`  - User identifier for the trace, defaults to completion argument `user`
+* `tags`           - Tags for the trace, defaults to `None`
+
+##### Updatable Parameters on Continuation
+
+The following parameters can be updated on a continuation of a trace by passing in the following values into the `update_trace_keys` in the metadata of the completion.
+
+* `input`          - Will set the traces input to be the input of this latest generation
+* `output`         - Will set the traces output to be the output of this generation
+* `trace_version`  - Will set the trace version to be the provided value (To use the latest generations version instead, use `version`)
+* `trace_release`  - Will set the trace release to be the provided value
+* `trace_metadata` - Will set the trace metadata to the provided value
+* `trace_user_id`  - Will set the trace user id to the provided value
+
+#### Generation Specific Parameters
+
+* `generation_id`   - Identifier for the generation, auto-generated by default
+* `generation_name` - Name for the generation, auto-generated by default
+* `prompt`          - Langfuse prompt object used for the generation, defaults to None
+
+Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
+
 ### Use LangChain ChatLiteLLM + Langfuse
 Pass `trace_user_id`, `session_id` in model_kwargs
 ```python
@ -167,6 +211,21 @@ messages = [
 chat(messages)
 ```

+## Redacting Messages, Response Content from Langfuse Logging 
+
+### Redact Messages and Responses from all Langfuse Logging
+
+Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to Langfuse, but request metadata will still be logged.
+
+### Redact Messages and Responses from specific Langfuse Logging
+
+In the metadata typically passed for text completion or embedding calls you can set specific keys to mask the messages and responses for this call.
+
+Setting `mask_input` to `True` will mask the input from being logged for this call.
+
+Setting `mask_output` to `True` will mask the output from being logged for this call.
+
+Be aware that if you are continuing an existing trace, and you set `update_trace_keys` to include either `input` or `output` and you set the corresponding `mask_input` or `mask_output`, then that trace will have its existing input and/or output replaced with a redacted message.

 ## Troubleshooting & Errors
 ### Data not getting logged to Langfuse ? 
--- a/docs/my-website/docs/observability/langsmith_integration.md
+++ b/docs/my-website/docs/observability/langsmith_integration.md
@ -57,7 +57,7 @@ os.environ["LANGSMITH_API_KEY"] = ""
 os.environ['OPENAI_API_KEY']=""

 # set langfuse as a callback, litellm will send the data to langfuse
-litellm.success_callback = ["langfuse"] 
+litellm.success_callback = ["langsmith"] 
 
 response = litellm.completion(
    model="gpt-3.5-turbo",
@ -71,6 +71,23 @@ response = litellm.completion(
 )
 print(response)
 ```
+
+### Make LiteLLM Proxy use Custom `LANGSMITH_BASE_URL`
+
+If you're using a custom LangSmith instance, you can set the
+`LANGSMITH_BASE_URL` environment variable to point to your instance.
+For example, you can make LiteLLM Proxy log to a local LangSmith instance with
+this config:
+
+```yaml
+litellm_settings:
+  success_callback: ["langsmith"]
+
+environment_variables:
+  LANGSMITH_BASE_URL: "http://localhost:1984"
+  LANGSMITH_PROJECT: "litellm-proxy"
+```
+
 ## Support & Talk to Founders

 - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
--- a/docs/my-website/docs/observability/openmeter.md
+++ b/docs/my-website/docs/observability/openmeter.md
@ -0,0 +1,97 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# OpenMeter - Usage-Based Billing
+
+[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.
+
+<Image img={require('../../img/openmeter.png')} />
+
+:::info
+We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
+join our [discord](https://discord.gg/wuPM9dRgDw)
+::: 
+
+
+## Quick Start
+Use just 2 lines of code, to instantly log your responses **across all providers** with OpenMeter
+
+Get your OpenMeter API Key from https://openmeter.cloud/meters
+
+```python
+litellm.callbacks = ["openmeter"] # logs cost + usage of successful calls to openmeter
+```
+
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+# pip install openmeter 
+import litellm
+import os
+
+# from https://openmeter.cloud
+os.environ["OPENMETER_API_ENDPOINT"] = ""
+os.environ["OPENMETER_API_KEY"] = ""
+
+# LLM API Keys
+os.environ['OPENAI_API_KEY']=""
+
+# set openmeter as a callback, litellm will send the data to openmeter
+litellm.callbacks = ["openmeter"] 
+ 
+# openai call
+response = litellm.completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ]
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add to Config.yaml
+```yaml
+model_list:
+- litellm_params:
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_key: my-fake-key
+    model: openai/my-fake-model
+  model_name: fake-openai-endpoint
+
+litellm_settings:
+  callbacks: ["openmeter"] # 👈 KEY CHANGE
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "fake-openai-endpoint",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ]
+    }
+'
+```
+
+</TabItem>
+</Tabs>
+
+
+<Image img={require('../../img/openmeter_img_2.png')} />
--- a/docs/my-website/docs/observability/sentry.md
+++ b/docs/my-website/docs/observability/sentry.md
@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
 print(response)
 ```

+## Redacting Messages, Response Content from Sentry Logging 
+
+Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to Sentry, but request metadata will still be logged.
+
 [Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry. 

--- a/docs/my-website/docs/providers/anthropic.md
+++ b/docs/my-website/docs/providers/anthropic.md
@ -223,6 +223,117 @@ assert isinstance(

 ```

+### Setting `anthropic-beta` Header in Requests
+
+Pass the `extra_headers` param to litellm. All headers will be forwarded to the Anthropic API.
+
+```python
+response = completion(
+    model="anthropic/claude-3-opus-20240229",
+    messages=messages,
+    tools=tools,
+    extra_headers={"anthropic-beta": "tools-2024-04-04"},
+)
+```
+
+### Forcing Anthropic Tool Use
+
+If you want Claude to use a specific tool to answer the user’s question, you can do this by specifying the tool in the `tool_choice` field like so:
+```python
+response = completion(
+    model="anthropic/claude-3-opus-20240229",
+    messages=messages,
+    tools=tools,
+    tool_choice={"type": "tool", "name": "get_weather"},
+)
+```
+
+
+### Parallel Function Calling 
+
+Here's how to pass the result of a function call back to an anthropic model: 
+
+```python
+import litellm
+from litellm import completion
+import os 
+
+os.environ["ANTHROPIC_API_KEY"] = "sk-ant.."
+
+
+litellm.set_verbose = True
+
+### 1ST FUNCTION CALL ###
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+messages = [
+    {
+        "role": "user",
+        "content": "What's the weather like in Boston today in Fahrenheit?",
+    }
+]
+try:
+    # test without max tokens
+    response = completion(
+        model="anthropic/claude-3-opus-20240229",
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+    )
+    # Add any assertions, here to check response args
+    print(response)
+    assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
+    assert isinstance(
+        response.choices[0].message.tool_calls[0].function.arguments, str
+    )
+
+    messages.append(
+        response.choices[0].message.model_dump()
+    )  # Add assistant tool invokes
+    tool_result = (
+        '{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
+    )
+    # Add user submitted tool results in the OpenAI format
+    messages.append(
+        {
+            "tool_call_id": response.choices[0].message.tool_calls[0].id,
+            "role": "tool",
+            "name": response.choices[0].message.tool_calls[0].function.name,
+            "content": tool_result,
+        }
+    )
+    ### 2ND FUNCTION CALL ###
+    # In the second response, Claude should deduce answer from tool results
+    second_response = completion(
+        model="anthropic/claude-3-opus-20240229",
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+    )
+    print(second_response)
+except Exception as e:
+    print(f"An error occurred - {str(e)}")
+```
+
+s/o @[Shekhar Patnaik](https://www.linkedin.com/in/patnaikshekhar) for requesting this!

 ## Usage - Vision 

--- a/docs/my-website/docs/providers/azure_ai.md
+++ b/docs/my-website/docs/providers/azure_ai.md
@ -3,8 +3,6 @@ import TabItem from '@theme/TabItem';

 # Azure AI Studio

-## Sample Usage
-
 **Ensure the following:**
 1. The API Base passed ends in the `/v1/` prefix
  example:
@ -14,8 +12,11 @@ import TabItem from '@theme/TabItem';

 2. The `model` passed is listed in [supported models](#supported-models). You **DO NOT** Need to pass your deployment name to litellm. Example `model=azure/Mistral-large-nmefg`  

+## Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">

-**Quick Start**
 ```python
 import litellm
 response = litellm.completion(
@ -26,6 +27,9 @@ response = litellm.completion(
 )
 ```

+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
 ## Sample Usage - LiteLLM Proxy

 1. Add models to your config.yaml
@ -99,6 +103,107 @@ response = litellm.completion(

  </Tabs>

+</TabItem>
+</Tabs>
+
+## Function Calling 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+import os
+from litellm import completion
+
+# set env
+os.environ["AZURE_MISTRAL_API_KEY"] = "your-api-key"
+os.environ["AZURE_MISTRAL_API_BASE"] = "your-api-base"
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
+
+response = completion(
+    model="azure/mistral-large-latest",
+    api_base=os.getenv("AZURE_MISTRAL_API_BASE"),
+    api_key=os.getenv("AZURE_MISTRAL_API_KEY"),
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",
+)
+# Add any assertions, here to check response args
+print(response)
+assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
+assert isinstance(
+    response.choices[0].message.tool_calls[0].function.arguments, str
+)
+
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $YOUR_API_KEY" \
+-d '{
+  "model": "mistral",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What'\''s the weather like in Boston today?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The city and state, e.g. San Francisco, CA"
+            },
+            "unit": {
+              "type": "string",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location"]
+        }
+      }
+    }
+  ],
+  "tool_choice": "auto"
+}'
+
+```
+
+</TabItem>
+</Tabs>
+
 ## Supported Models

 | Model Name               | Function Call                                                                                                                                                      |
--- a/docs/my-website/docs/providers/bedrock.md
+++ b/docs/my-website/docs/providers/bedrock.md
@ -535,7 +535,8 @@ print(response)

 | Model Name           | Function Call                               |
 |----------------------|---------------------------------------------|
-| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
+| Titan Embeddings V2 | `embedding(model="bedrock/amazon.titan-embed-text-v2:0", input=input)` |
+| Titan Embeddings - V1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
 | Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
 | Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |

--- a/docs/my-website/docs/providers/clarifai.md
+++ b/docs/my-website/docs/providers/clarifai.md
@ -0,0 +1,177 @@
+
+# Clarifai
+Anthropic, OpenAI, Mistral, Llama and Gemini LLMs are Supported on Clarifai. 
+
+## Pre-Requisites
+
+`pip install clarifai`
+
+`pip install litellm`
+
+## Required Environment Variables
+To obtain your Clarifai Personal access token follow this [link](https://docs.clarifai.com/clarifai-basics/authentication/personal-access-tokens/). Optionally the PAT can also be passed in `completion` function.
+
+```python
+os.environ["CLARIFAI_API_KEY"] = "YOUR_CLARIFAI_PAT"  # CLARIFAI_PAT
+```
+
+## Usage
+
+```python
+import os
+from litellm import completion
+
+os.environ["CLARIFAI_API_KEY"] = ""
+
+response = completion(
+  model="clarifai/mistralai.completion.mistral-large",
+  messages=[{ "content": "Tell me a joke about physics?","role": "user"}]
+)
+```
+
+**Output**
+```json
+{
+    "id": "chatcmpl-572701ee-9ab2-411c-ac75-46c1ba18e781",
+    "choices": [
+      {
+        "finish_reason": "stop",
+        "index": 1,
+        "message": {
+          "content": "Sure, here's a physics joke for you:\n\nWhy can't you trust an atom?\n\nBecause they make up everything!",
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1714410197,
+    "model": "https://api.clarifai.com/v2/users/mistralai/apps/completion/models/mistral-large/outputs",
+    "object": "chat.completion",
+    "system_fingerprint": null,
+    "usage": {
+      "prompt_tokens": 14,
+      "completion_tokens": 24,
+      "total_tokens": 38
+    }
+  }
+```
+
+## Clarifai models
+liteLLM supports non-streaming requests to all models on [Clarifai community](https://clarifai.com/explore/models?filterData=%5B%7B%22field%22%3A%22use_cases%22%2C%22value%22%3A%5B%22llm%22%5D%7D%5D&page=1&perPage=24)
+
+Example  Usage - Note: liteLLM supports all models deployed on Clarifai
+
+## Llama LLMs
+| Model Name                        | Function Call |
+|---------------------------|---------------------------------|
+| clarifai/meta.Llama-2.llama2-7b-chat    | `completion('clarifai/meta.Llama-2.llama2-7b-chat', messages)` |
+| clarifai/meta.Llama-2.llama2-13b-chat   | `completion('clarifai/meta.Llama-2.llama2-13b-chat', messages)` |
+| clarifai/meta.Llama-2.llama2-70b-chat   | `completion('clarifai/meta.Llama-2.llama2-70b-chat', messages)` |
+| clarifai/meta.Llama-2.codeLlama-70b-Python   | `completion('clarifai/meta.Llama-2.codeLlama-70b-Python', messages)`| 
+| clarifai/meta.Llama-2.codeLlama-70b-Instruct | `completion('clarifai/meta.Llama-2.codeLlama-70b-Instruct', messages)` |   
+
+## Mistral LLMs
+| Model Name                                  | Function Call                                                         |
+|---------------------------------------------|------------------------------------------------------------------------|
+| clarifai/mistralai.completion.mixtral-8x22B            | `completion('clarifai/mistralai.completion.mixtral-8x22B', messages)`               |
+| clarifai/mistralai.completion.mistral-large           | `completion('clarifai/mistralai.completion.mistral-large', messages)`              |
+| clarifai/mistralai.completion.mistral-medium          | `completion('clarifai/mistralai.completion.mistral-medium', messages)`             |
+| clarifai/mistralai.completion.mistral-small           | `completion('clarifai/mistralai.completion.mistral-small', messages)`              |
+| clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1 | `completion('clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1', messages)` |
+| clarifai/mistralai.completion.mistral-7B-OpenOrca  | `completion('clarifai/mistralai.completion.mistral-7B-OpenOrca', messages)`          |
+| clarifai/mistralai.completion.openHermes-2-mistral-7B | `completion('clarifai/mistralai.completion.openHermes-2-mistral-7B', messages)`      |
+
+
+## Jurassic LLMs 
+| Model Name                                    | Function Call                                                      |
+|-----------------------------------------------|---------------------------------------------------------------------|
+| clarifai/ai21.complete.Jurassic2-Grande       | `completion('clarifai/ai21.complete.Jurassic2-Grande', messages)`       |
+| clarifai/ai21.complete.Jurassic2-Grande-Instruct | `completion('clarifai/ai21.complete.Jurassic2-Grande-Instruct', messages)` |
+| clarifai/ai21.complete.Jurassic2-Jumbo-Instruct  | `completion('clarifai/ai21.complete.Jurassic2-Jumbo-Instruct', messages)`  |
+| clarifai/ai21.complete.Jurassic2-Jumbo         | `completion('clarifai/ai21.complete.Jurassic2-Jumbo', messages)`          |
+| clarifai/ai21.complete.Jurassic2-Large         | `completion('clarifai/ai21.complete.Jurassic2-Large', messages)`          |
+
+## Wizard LLMs
+
+| Model Name                                    | Function Call                                                      |
+|-----------------------------------------------|---------------------------------------------------------------------|
+| clarifai/wizardlm.generate.wizardCoder-Python-34B | `completion('clarifai/wizardlm.generate.wizardCoder-Python-34B', messages)`    |
+| clarifai/wizardlm.generate.wizardLM-70B          | `completion('clarifai/wizardlm.generate.wizardLM-70B', messages)`             | 
+| clarifai/wizardlm.generate.wizardLM-13B          | `completion('clarifai/wizardlm.generate.wizardLM-13B', messages)`           |
+| clarifai/wizardlm.generate.wizardCoder-15B       | `completion('clarifai/wizardlm.generate.wizardCoder-15B', messages)`          |
+
+## Anthropic models
+
+| Model Name                                    | Function Call                                                      |
+|-----------------------------------------------|---------------------------------------------------------------------|
+| clarifai/anthropic.completion.claude-v1       | `completion('clarifai/anthropic.completion.claude-v1', messages)`       |
+| clarifai/anthropic.completion.claude-instant-1_2 | `completion('clarifai/anthropic.completion.claude-instant-1_2', messages)` |
+| clarifai/anthropic.completion.claude-instant  | `completion('clarifai/anthropic.completion.claude-instant', messages)`  |
+| clarifai/anthropic.completion.claude-v2       | `completion('clarifai/anthropic.completion.claude-v2', messages)`       |
+| clarifai/anthropic.completion.claude-2_1      | `completion('clarifai/anthropic.completion.claude-2_1', messages)`      |
+| clarifai/anthropic.completion.claude-3-opus   | `completion('clarifai/anthropic.completion.claude-3-opus', messages)`   |
+| clarifai/anthropic.completion.claude-3-sonnet | `completion('clarifai/anthropic.completion.claude-3-sonnet', messages)` |
+
+## OpenAI GPT LLMs
+
+| Model Name                                    | Function Call                                                      |
+|-----------------------------------------------|---------------------------------------------------------------------|
+| clarifai/openai.chat-completion.GPT-4         | `completion('clarifai/openai.chat-completion.GPT-4', messages)`          |
+| clarifai/openai.chat-completion.GPT-3_5-turbo | `completion('clarifai/openai.chat-completion.GPT-3_5-turbo', messages)`  |
+| clarifai/openai.chat-completion.gpt-4-turbo   | `completion('clarifai/openai.chat-completion.gpt-4-turbo', messages)`    |
+| clarifai/openai.completion.gpt-3_5-turbo-instruct | `completion('clarifai/openai.completion.gpt-3_5-turbo-instruct', messages)` |
+
+## GCP LLMs
+
+| Model Name                                    | Function Call                                                      |
+|-----------------------------------------------|---------------------------------------------------------------------|
+| clarifai/gcp.generate.gemini-1_5-pro         | `completion('clarifai/gcp.generate.gemini-1_5-pro', messages)`          |
+| clarifai/gcp.generate.imagen-2               | `completion('clarifai/gcp.generate.imagen-2', messages)`                |
+| clarifai/gcp.generate.code-gecko             | `completion('clarifai/gcp.generate.code-gecko', messages)`              |
+| clarifai/gcp.generate.code-bison             | `completion('clarifai/gcp.generate.code-bison', messages)`              |
+| clarifai/gcp.generate.text-bison            | `completion('clarifai/gcp.generate.text-bison', messages)`               |
+| clarifai/gcp.generate.gemma-2b-it            | `completion('clarifai/gcp.generate.gemma-2b-it', messages)`              |
+| clarifai/gcp.generate.gemma-7b-it            | `completion('clarifai/gcp.generate.gemma-7b-it', messages)`              |
+| clarifai/gcp.generate.gemini-pro            | `completion('clarifai/gcp.generate.gemini-pro', messages)`               |
+| clarifai/gcp.generate.gemma-1_1-7b-it       | `completion('clarifai/gcp.generate.gemma-1_1-7b-it', messages)`          |
+
+## Cohere LLMs
+| Model Name                                    | Function Call                                                      |
+|-----------------------------------------------|---------------------------------------------------------------------|
+| clarifai/cohere.generate.cohere-generate-command | `completion('clarifai/cohere.generate.cohere-generate-command', messages)` |
+| clarifai/cohere.generate.command-r-plus | `completion('clarifai/cohere.generate.command-r-plus', messages)` |
+
+## Databricks LLMs
+
+| Model Name                                        | Function Call                                                      |
+|---------------------------------------------------|---------------------------------------------------------------------|
+| clarifai/databricks.drbx.dbrx-instruct           | `completion('clarifai/databricks.drbx.dbrx-instruct', messages)`   |
+| clarifai/databricks.Dolly-v2.dolly-v2-12b        | `completion('clarifai/databricks.Dolly-v2.dolly-v2-12b', messages)`|
+
+## Microsoft LLMs
+
+| Model Name                                        | Function Call                                                      |
+|---------------------------------------------------|---------------------------------------------------------------------|
+| clarifai/microsoft.text-generation.phi-2          | `completion('clarifai/microsoft.text-generation.phi-2', messages)`  |
+| clarifai/microsoft.text-generation.phi-1_5        | `completion('clarifai/microsoft.text-generation.phi-1_5', messages)`|
+
+## Salesforce models
+
+| Model Name                                                | Function Call                                                                |
+|-----------------------------------------------------------|-------------------------------------------------------------------------------|
+| clarifai/salesforce.blip.general-english-image-caption-blip-2 | `completion('clarifai/salesforce.blip.general-english-image-caption-blip-2', messages)` |
+| clarifai/salesforce.xgen.xgen-7b-8k-instruct             | `completion('clarifai/salesforce.xgen.xgen-7b-8k-instruct', messages)`         |
+
+
+## Other Top performing LLMs
+
+| Model Name                                        | Function Call                                                      |
+|---------------------------------------------------|---------------------------------------------------------------------|
+| clarifai/deci.decilm.deciLM-7B-instruct          | `completion('clarifai/deci.decilm.deciLM-7B-instruct', messages)`  |
+| clarifai/upstage.solar.solar-10_7b-instruct      | `completion('clarifai/upstage.solar.solar-10_7b-instruct', messages)` |
+| clarifai/openchat.openchat.openchat-3_5-1210     | `completion('clarifai/openchat.openchat.openchat-3_5-1210', messages)` |
+| clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B | `completion('clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B', messages)` |
+| clarifai/fblgit.una-cybertron.una-cybertron-7b-v2 | `completion('clarifai/fblgit.una-cybertron.una-cybertron-7b-v2', messages)` |
+| clarifai/tiiuae.falcon.falcon-40b-instruct       | `completion('clarifai/tiiuae.falcon.falcon-40b-instruct', messages)` |
+| clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat | `completion('clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat', messages)` |
+| clarifai/bigcode.code.StarCoder                  | `completion('clarifai/bigcode.code.StarCoder', messages)`           |
+| clarifai/mosaicml.mpt.mpt-7b-instruct            | `completion('clarifai/mosaicml.mpt.mpt-7b-instruct', messages)`     |
--- a/docs/my-website/docs/providers/deepseek.md
+++ b/docs/my-website/docs/providers/deepseek.md
@ -0,0 +1,54 @@
+# Deepseek
+https://deepseek.com/
+
+**We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests**
+
+## API Key
+```python
+# env variable
+os.environ['DEEPSEEK_API_KEY']
+```
+
+## Sample Usage
+```python
+from litellm import completion
+import os
+
+os.environ['DEEPSEEK_API_KEY'] = ""
+response = completion(
+    model="deepseek/deepseek-chat", 
+    messages=[
+       {"role": "user", "content": "hello from litellm"}
+   ],
+)
+print(response)
+```
+
+## Sample Usage - Streaming
+```python
+from litellm import completion
+import os
+
+os.environ['DEEPSEEK_API_KEY'] = ""
+response = completion(
+    model="deepseek/deepseek-chat", 
+    messages=[
+       {"role": "user", "content": "hello from litellm"}
+   ],
+    stream=True
+)
+
+for chunk in response:
+    print(chunk)
+```
+
+
+## Supported Models - ALL Deepseek Models Supported!
+We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests
+
+| Model Name               | Function Call                                                                                                                                                      |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` | 
+| deepseek-coder | `completion(model="deepseek/deepseek-coder", messages)` | 
+
+
--- a/docs/my-website/docs/providers/gemini.md
+++ b/docs/my-website/docs/providers/gemini.md
@ -23,7 +23,7 @@ In certain use-cases you may need to make calls to the models and pass [safety s
 ```python
 response = completion(
    model="gemini/gemini-pro", 
-    messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}]
+    messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}],
    safety_settings=[
        {
            "category": "HARM_CATEGORY_HARASSMENT",
--- a/docs/my-website/docs/providers/groq.md
+++ b/docs/my-website/docs/providers/groq.md
@ -48,6 +48,109 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion

 | Model Name               | Function Call                                                                                                                                                      |
 |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` | 
+| llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` | 
 | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | 
 | mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
 | gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |  
+
+## Groq - Tool / Function Calling Example
+
+```python
+# Example dummy function hard coded to return the current weather
+import json
+import litellm
+def get_current_weather(location, unit="fahrenheit"):
+    """Get the current weather in a given location"""
+    if "tokyo" in location.lower():
+        return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
+    elif "san francisco" in location.lower():
+        return json.dumps(
+            {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
+        )
+    elif "paris" in location.lower():
+        return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
+    else:
+        return json.dumps({"location": location, "temperature": "unknown"})
+
+
+
+
+# Step 1: send the conversation and available functions to the model
+messages = [
+    {
+        "role": "system",
+        "content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
+    },
+    {
+        "role": "user",
+        "content": "What's the weather like in San Francisco?",
+    },
+]
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+response = litellm.completion(
+    model="groq/llama2-70b-4096",
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",  # auto is default, but we'll be explicit
+)
+print("Response\n", response)
+response_message = response.choices[0].message
+tool_calls = response_message.tool_calls
+
+
+# Step 2: check if the model wanted to call a function
+if tool_calls:
+    # Step 3: call the function
+    # Note: the JSON response may not always be valid; be sure to handle errors
+    available_functions = {
+        "get_current_weather": get_current_weather,
+    }
+    messages.append(
+        response_message
+    )  # extend conversation with assistant's reply
+    print("Response message\n", response_message)
+    # Step 4: send the info for each function call and function response to the model
+    for tool_call in tool_calls:
+        function_name = tool_call.function.name
+        function_to_call = available_functions[function_name]
+        function_args = json.loads(tool_call.function.arguments)
+        function_response = function_to_call(
+            location=function_args.get("location"),
+            unit=function_args.get("unit"),
+        )
+        messages.append(
+            {
+                "tool_call_id": tool_call.id,
+                "role": "tool",
+                "name": function_name,
+                "content": function_response,
+            }
+        )  # extend conversation with function response
+    print(f"messages: {messages}")
+    second_response = litellm.completion(
+        model="groq/llama2-70b-4096", messages=messages
+    )  # get a new response from the model where it can see the function response
+    print("second response\n", second_response)
+```
--- a/docs/my-website/docs/providers/huggingface.md
+++ b/docs/my-website/docs/providers/huggingface.md
@ -21,6 +21,11 @@ This is done by adding the "huggingface/" prefix to `model`, example `completion
 <Tabs>
 <TabItem value="tgi" label="Text-generation-interface (TGI)">

+By default, LiteLLM will assume a huggingface call follows the TGI format.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
 ```python
 import os 
 from litellm import completion 
@ -40,9 +45,58 @@ response = completion(
 print(response)
 ```

+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add models to your config.yaml
+
+  ```yaml
+  model_list:
+    - model_name: wizard-coder
+      litellm_params:
+        model: huggingface/WizardLM/WizardCoder-Python-34B-V1.0
+        api_key: os.environ/HUGGINGFACE_API_KEY
+        api_base: "https://my-endpoint.endpoints.huggingface.cloud"
+  ```
+
+
+
+2. Start the proxy 
+
+  ```bash
+  $ litellm --config /path/to/config.yaml --debug
+  ```
+
+3. Test it!
+
+  ```shell
+  curl --location 'http://0.0.0.0:4000/chat/completions' \
+      --header 'Authorization: Bearer sk-1234' \
+      --header 'Content-Type: application/json' \
+      --data '{
+      "model": "wizard-coder",
+      "messages": [
+        {
+            "role": "user",
+            "content": "I like you!"
+        }
+        ]
+  }'
+  ```
+
+
+</TabItem> 
+</Tabs>
 </TabItem>
 <TabItem value="conv" label="Conversational-task (BlenderBot, etc.)">

+Append `conversational` to the model name 
+
+e.g. `huggingface/conversational/<model-name>`
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
 ```python
 import os 
 from litellm import completion 
@ -54,7 +108,7 @@ messages = [{ "content": "There's a llama in my garden 😱 What should I do?","

 # e.g. Call 'facebook/blenderbot-400M-distill' hosted on HF Inference endpoints
 response = completion(
-  model="huggingface/facebook/blenderbot-400M-distill", 
+  model="huggingface/conversational/facebook/blenderbot-400M-distill", 
  messages=messages, 
  api_base="https://my-endpoint.huggingface.cloud"
 )
@ -62,7 +116,123 @@ response = completion(
 print(response)
 ```
 </TabItem>
-<TabItem value="none" label="Non TGI/Conversational-task LLMs">
+<TabItem value="proxy" label="PROXY">
+
+1. Add models to your config.yaml
+
+  ```yaml
+  model_list:
+    - model_name: blenderbot
+      litellm_params:
+        model: huggingface/conversational/facebook/blenderbot-400M-distill
+        api_key: os.environ/HUGGINGFACE_API_KEY
+        api_base: "https://my-endpoint.endpoints.huggingface.cloud"
+  ```
+
+
+
+2. Start the proxy 
+
+  ```bash
+  $ litellm --config /path/to/config.yaml --debug
+  ```
+
+3. Test it!
+
+  ```shell
+  curl --location 'http://0.0.0.0:4000/chat/completions' \
+      --header 'Authorization: Bearer sk-1234' \
+      --header 'Content-Type: application/json' \
+      --data '{
+      "model": "blenderbot",
+      "messages": [
+        {
+            "role": "user",
+            "content": "I like you!"
+        }
+        ]
+  }'
+  ```
+
+
+</TabItem> 
+</Tabs>
+</TabItem>
+<TabItem value="classification" label="Text Classification">
+
+Append `text-classification` to the model name 
+
+e.g. `huggingface/text-classification/<model-name>`
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+import os 
+from litellm import completion 
+
+# [OPTIONAL] set env var
+os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key" 
+
+messages = [{ "content": "I like you, I love you!","role": "user"}]
+
+# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints
+response = completion(
+  model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier", 
+  messages=messages,
+  api_base="https://my-endpoint.endpoints.huggingface.cloud",
+)
+
+print(response)
+```
+</TabItem> 
+<TabItem value="proxy" label="PROXY">
+
+1. Add models to your config.yaml
+
+  ```yaml
+  model_list:
+    - model_name: bert-classifier
+      litellm_params:
+        model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier
+        api_key: os.environ/HUGGINGFACE_API_KEY
+        api_base: "https://my-endpoint.endpoints.huggingface.cloud"
+  ```
+
+
+
+2. Start the proxy 
+
+  ```bash
+  $ litellm --config /path/to/config.yaml --debug
+  ```
+
+3. Test it!
+
+  ```shell
+  curl --location 'http://0.0.0.0:4000/chat/completions' \
+      --header 'Authorization: Bearer sk-1234' \
+      --header 'Content-Type: application/json' \
+      --data '{
+      "model": "bert-classifier",
+      "messages": [
+        {
+            "role": "user",
+            "content": "I like you!"
+        }
+        ]
+  }'
+  ```
+
+
+</TabItem> 
+</Tabs>
+</TabItem>
+<TabItem value="none" label="Text Generation (NOT TGI)">
+
+Append `text-generation` to the model name 
+
+e.g. `huggingface/text-generation/<model-name>`

 ```python
 import os 
@ -75,7 +245,7 @@ messages = [{ "content": "There's a llama in my garden 😱 What should I do?","

 # e.g. Call 'roneneldan/TinyStories-3M' hosted on HF Inference endpoints
 response = completion(
-  model="huggingface/roneneldan/TinyStories-3M", 
+  model="huggingface/text-generation/roneneldan/TinyStories-3M", 
  messages=messages,
  api_base="https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
 )
--- a/docs/my-website/docs/providers/mistral.md
+++ b/docs/my-website/docs/providers/mistral.md
@ -44,13 +44,58 @@ for chunk in response:
 ## Supported Models
 All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/c1b25538277206b9f00de5254d80d6a83bb19a29/model_prices_and_context_window.json).

-| Model Name               | Function Call                                                                                                                                                      |
-|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| mistral-tiny | `completion(model="mistral/mistral-tiny", messages)` | 
-| mistral-small | `completion(model="mistral/mistral-small", messages)` | 
-| mistral-medium | `completion(model="mistral/mistral-medium", messages)` | 
-| mistral-large-latest | `completion(model="mistral/mistral-large-latest", messages)` | 
+| Model Name     | Function Call                                                |
+|----------------|--------------------------------------------------------------|
+| Mistral Small  | `completion(model="mistral/mistral-small-latest", messages)` |
+| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
+| Mistral Large  | `completion(model="mistral/mistral-large-latest", messages)` |
+| Mistral 7B     | `completion(model="mistral/open-mistral-7b", messages)`      |
+| Mixtral 8x7B   | `completion(model="mistral/open-mixtral-8x7b", messages)`    |
+| Mixtral 8x22B  | `completion(model="mistral/open-mixtral-8x22b", messages)`   |

+## Function Calling 
+
+```python
+import os
+from litellm import completion
+
+# set env
+os.environ["MISTRAL_API_KEY"] = "your-api-key"
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
+
+response = completion(
+    model="mistral/mistral-large-latest",
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",
+)
+# Add any assertions here to check the response args
+print(response)
+assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
+assert isinstance(
+    response.choices[0].message.tool_calls[0].function.arguments, str
+)
+```

 ## Sample Usage - Embedding
 ```python
@ -71,6 +116,6 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported

 | Model Name               | Function Call                                                                                                                                                      |
 |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| mistral-embed | `embedding(model="mistral/mistral-embed", input)` | 
+| Mistral Embeddings | `embedding(model="mistral/mistral-embed", input)` | 


--- a/docs/my-website/docs/providers/ollama.md
+++ b/docs/my-website/docs/providers/ollama.md
@ -102,12 +102,18 @@ Ollama supported models: https://github.com/ollama/ollama
 | Model Name           | Function Call                                                                     |
 |----------------------|-----------------------------------------------------------------------------------
 | Mistral    | `completion(model='ollama/mistral', messages, api_base="http://localhost:11434", stream=True)` |
+| Mistral-7B-Instruct-v0.1 | `completion(model='ollama/mistral-7B-Instruct-v0.1', messages, api_base="http://localhost:11434", stream=False)` |
+| Mistral-7B-Instruct-v0.2 | `completion(model='ollama/mistral-7B-Instruct-v0.2', messages, api_base="http://localhost:11434", stream=False)` |
+| Mixtral-8x7B-Instruct-v0.1 | `completion(model='ollama/mixtral-8x7B-Instruct-v0.1', messages, api_base="http://localhost:11434", stream=False)` |
+| Mixtral-8x22B-Instruct-v0.1 | `completion(model='ollama/mixtral-8x22B-Instruct-v0.1', messages, api_base="http://localhost:11434", stream=False)` |
 | Llama2 7B            | `completion(model='ollama/llama2', messages, api_base="http://localhost:11434", stream=True)` | 
 | Llama2 13B           | `completion(model='ollama/llama2:13b', messages, api_base="http://localhost:11434", stream=True)` | 
 | Llama2 70B           | `completion(model='ollama/llama2:70b', messages, api_base="http://localhost:11434", stream=True)` | 
 | Llama2 Uncensored    | `completion(model='ollama/llama2-uncensored', messages, api_base="http://localhost:11434", stream=True)` | 
 | Code Llama    | `completion(model='ollama/codellama', messages, api_base="http://localhost:11434", stream=True)` | 
 | Llama2 Uncensored    | `completion(model='ollama/llama2-uncensored', messages, api_base="http://localhost:11434", stream=True)` |
+| Meta LLaMa3 8B | `completion(model='ollama/llama3', messages, api_base="http://localhost:11434", stream=False)` |
+| Meta LLaMa3 70B | `completion(model='ollama/llama3:70b', messages, api_base="http://localhost:11434", stream=False)` |
 | Orca Mini            | `completion(model='ollama/orca-mini', messages, api_base="http://localhost:11434", stream=True)` |
 | Vicuna               | `completion(model='ollama/vicuna', messages, api_base="http://localhost:11434", stream=True)` |
 | Nous-Hermes          | `completion(model='ollama/nous-hermes', messages, api_base="http://localhost:11434", stream=True)` |
--- a/docs/my-website/docs/providers/openai.md
+++ b/docs/my-website/docs/providers/openai.md
@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

 # OpenAI
-LiteLLM supports OpenAI Chat + Text completion and embedding calls.
+LiteLLM supports OpenAI Chat + Embedding calls.

 ### Required API Keys

@ -20,7 +20,7 @@ os.environ["OPENAI_API_KEY"] = "your-api-key"

 # openai call
 response = completion(
-    model = "gpt-3.5-turbo", 
+    model = "gpt-4o", 
    messages=[{ "content": "Hello, how are you?","role": "user"}]
 )
 ```
@ -163,6 +163,10 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base"     # OPTIONAL

 | Model Name            | Function Call                                                   |
 |-----------------------|-----------------------------------------------------------------|
+| gpt-4o   | `response = completion(model="gpt-4o", messages=messages)` |
+| gpt-4o-2024-05-13   | `response = completion(model="gpt-4o-2024-05-13", messages=messages)` |
+| gpt-4-turbo   | `response = completion(model="gpt-4-turbo", messages=messages)` |
+| gpt-4-turbo-preview   | `response = completion(model="gpt-4-turbo-preview", messages=messages)` |
 | gpt-4-0125-preview    | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
 | gpt-4-1106-preview    | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
 | gpt-3.5-turbo-1106    | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
@ -184,6 +188,8 @@ These also support the `OPENAI_API_BASE` environment variable, which can be used
 ## OpenAI Vision Models 
 | Model Name            | Function Call                                                   |
 |-----------------------|-----------------------------------------------------------------|
+| gpt-4o   | `response = completion(model="gpt-4o", messages=messages)` |
+| gpt-4-turbo    | `response = completion(model="gpt-4-turbo", messages=messages)` |
 | gpt-4-vision-preview    | `response = completion(model="gpt-4-vision-preview", messages=messages)` |

 #### Usage
@ -217,19 +223,6 @@ response = completion(

 ```

-## OpenAI Text Completion Models / Instruct Models
-
-| Model Name          | Function Call                                      |
-|---------------------|----------------------------------------------------|
-| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
-| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-0914", messages=messages)` |
-| text-davinci-003    | `response = completion(model="text-davinci-003", messages=messages)` |
-| ada-001             | `response = completion(model="ada-001", messages=messages)` |
-| curie-001           | `response = completion(model="curie-001", messages=messages)` |
-| babbage-001         | `response = completion(model="babbage-001", messages=messages)` |
-| babbage-002         | `response = completion(model="babbage-002", messages=messages)` |
-| davinci-002         | `response = completion(model="davinci-002", messages=messages)` |
-
 ## Advanced

 ### Parallel Function calling
--- a/docs/my-website/docs/providers/openai_compatible.md
+++ b/docs/my-website/docs/providers/openai_compatible.md
@ -5,7 +5,9 @@ import TabItem from '@theme/TabItem';

 To call models hosted behind an openai proxy, make 2 changes:

-1. Put `openai/` in front of your model name, so litellm knows you're trying to call an openai-compatible endpoint. 
+1. For `/chat/completions`: Put `openai/` in front of your model name, so litellm knows you're trying to call an openai `/chat/completions` endpoint. 
+
+2. For `/completions`: Put `text-completion-openai/` in front of your model name, so litellm knows you're trying to call an openai `/completions` endpoint. 

 2. **Do NOT** add anything additional to the base url e.g. `/v1/embedding`. LiteLLM uses the openai-client to make these calls, and that automatically adds the relevant endpoints. 

--- a/docs/my-website/docs/providers/predibase.md
+++ b/docs/my-website/docs/providers/predibase.md
@ -0,0 +1,247 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# 🆕 Predibase
+
+LiteLLM supports all models on Predibase
+
+
+## Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+### API KEYS
+```python
+import os 
+os.environ["PREDIBASE_API_KEY"] = ""
+```
+
+### Example Call
+
+```python
+from litellm import completion
+import os
+## set ENV variables
+os.environ["PREDIBASE_API_KEY"] = "predibase key"
+os.environ["PREDIBASE_TENANT_ID"] = "predibase tenant id"
+
+# predibase llama-3 call
+response = completion(
+    model="predibase/llama-3-8b-instruct", 
+    messages = [{ "content": "Hello, how are you?","role": "user"}]
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add models to your config.yaml
+
+  ```yaml
+  model_list:
+    - model_name: llama-3
+      litellm_params:
+        model: predibase/llama-3-8b-instruct
+        api_key: os.environ/PREDIBASE_API_KEY
+        tenant_id: os.environ/PREDIBASE_TENANT_ID
+  ```
+
+
+
+2. Start the proxy 
+
+  ```bash
+  $ litellm --config /path/to/config.yaml --debug
+  ```
+
+3. Send Request to LiteLLM Proxy Server
+
+  <Tabs>
+
+  <TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+  ```python
+  import openai
+  client = openai.OpenAI(
+      api_key="sk-1234",             # pass litellm proxy key, if you're using virtual keys
+      base_url="http://0.0.0.0:4000" # litellm-proxy-base url
+  )
+
+  response = client.chat.completions.create(
+      model="llama-3",
+      messages = [
+        {
+            "role": "system",
+            "content": "Be a good human!"
+        },
+        {
+            "role": "user",
+            "content": "What do you know about earth?"
+        }
+    ]
+  )
+
+  print(response)
+  ```
+
+  </TabItem>
+
+  <TabItem value="curl" label="curl">
+
+  ```shell
+  curl --location 'http://0.0.0.0:4000/chat/completions' \
+      --header 'Authorization: Bearer sk-1234' \
+      --header 'Content-Type: application/json' \
+      --data '{
+      "model": "llama-3",
+      "messages": [
+        {
+            "role": "system",
+            "content": "Be a good human!"
+        },
+        {
+            "role": "user",
+            "content": "What do you know about earth?"
+        }
+        ]
+  }'
+  ```
+  </TabItem>
+
+  </Tabs>
+
+
+</TabItem>
+
+</Tabs>
+
+## Advanced Usage - Prompt Formatting 
+
+LiteLLM has prompt template mappings for all `meta-llama` llama3 instruct models. [**See Code**](https://github.com/BerriAI/litellm/blob/4f46b4c3975cd0f72b8c5acb2cb429d23580c18a/litellm/llms/prompt_templates/factory.py#L1360)
+
+To apply a custom prompt template: 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python 
+import litellm
+
+import os 
+os.environ["PREDIBASE_API_KEY"] = ""
+
+# Create your own custom prompt template 
+litellm.register_prompt_template(
+    model="togethercomputer/LLaMA-2-7B-32K",
+    initial_prompt_value="You are a good assistant",  # [OPTIONAL]
+    roles={
+        "system": {
+            "pre_message": "[INST] <<SYS>>\n",  # [OPTIONAL]
+            "post_message": "\n<</SYS>>\n [/INST]\n",  # [OPTIONAL]
+        },
+        "user": {
+            "pre_message": "[INST] ",  # [OPTIONAL]
+            "post_message": " [/INST]",  # [OPTIONAL]
+        },
+        "assistant": {
+            "pre_message": "\n",  # [OPTIONAL]
+            "post_message": "\n",  # [OPTIONAL]
+        },
+    },
+    final_prompt_value="Now answer as best you can:",  # [OPTIONAL]
+)
+
+def predibase_custom_model():
+    model = "predibase/togethercomputer/LLaMA-2-7B-32K"
+    messages = [{"role": "user", "content": "Hello, how are you?"}]
+    response = litellm.completion(model=model, messages=messages)
+    print(response['choices'][0]['message']['content'])
+    return response
+
+predibase_custom_model()
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```yaml
+# Model-specific parameters
+model_list:
+  - model_name: mistral-7b # model alias
+    litellm_params: # actual params for litellm.completion()
+      model: "predibase/mistralai/Mistral-7B-Instruct-v0.1" 
+      api_key: os.environ/PREDIBASE_API_KEY
+      initial_prompt_value: "\n"
+      roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}
+      final_prompt_value: "\n"
+      bos_token: "<s>"
+      eos_token: "</s>"
+      max_tokens: 4096
+```
+
+</TabItem>
+
+</Tabs>
+
+## Passing additional params - max_tokens, temperature 
+See all litellm.completion supported params [here](https://docs.litellm.ai/docs/completion/input)
+
+```python
+# !pip install litellm
+from litellm import completion
+import os
+## set ENV variables
+os.environ["PREDIBASE_API_KEY"] = "predibase key"
+
+# predibase llama-3 call
+response = completion(
+    model="predibase/llama-3-8b-instruct",
+    messages = [{ "content": "Hello, how are you?","role": "user"}],
+    max_tokens=20,
+    temperature=0.5
+)
+```
+
+**proxy**
+
+```yaml
+  model_list:
+    - model_name: llama-3
+      litellm_params:
+        model: predibase/llama-3-8b-instruct
+        api_key: os.environ/PREDIBASE_API_KEY
+        max_tokens: 20
+        temperature: 0.5
+```
+
+## Passing Predibase-specific params - adapter_id, adapter_source
+Send params [not supported by `litellm.completion()`](https://docs.litellm.ai/docs/completion/input) but supported by Predibase by passing them to `litellm.completion`
+
+For example, `adapter_id` and `adapter_source` are Predibase-specific params - [See List](https://github.com/BerriAI/litellm/blob/8a35354dd6dbf4c2fcefcd6e877b980fcbd68c58/litellm/llms/predibase.py#L54)
+
+```python
+# !pip install litellm
+from litellm import completion
+import os
+## set ENV variables
+os.environ["PREDIBASE_API_KEY"] = "predibase key"
+
+# predibase llama3 call
+response = completion(
+    model="predibase/llama-3-8b-instruct", 
+    messages = [{ "content": "Hello, how are you?","role": "user"}],
+    adapter_id="my_repo/3",
+    adapter_source="pbase",
+)
+```
+
+**proxy**
+
+```yaml
+  model_list:
+    - model_name: llama-3
+      litellm_params:
+        model: predibase/llama-3-8b-instruct
+        api_key: os.environ/PREDIBASE_API_KEY
+        adapter_id: my_repo/3
+        adapter_source: pbase
+```
--- a/docs/my-website/docs/providers/replicate.md
+++ b/docs/my-website/docs/providers/replicate.md
@ -1,7 +1,16 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # Replicate

 LiteLLM supports all models on Replicate

+
+## Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
 ### API KEYS
 ```python
 import os 
@ -16,14 +25,175 @@ import os
 ## set ENV variables
 os.environ["REPLICATE_API_KEY"] = "replicate key"

-# replicate llama-2 call
+# replicate llama-3 call
 response = completion(
-    model="replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf", 
+    model="replicate/meta/meta-llama-3-8b-instruct", 
    messages = [{ "content": "Hello, how are you?","role": "user"}]
 )
 ```

-### Example - Calling Replicate Deployments
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add models to your config.yaml
+
+  ```yaml
+  model_list:
+    - model_name: llama-3
+      litellm_params:
+        model: replicate/meta/meta-llama-3-8b-instruct
+        api_key: os.environ/REPLICATE_API_KEY
+  ```
+
+
+
+2. Start the proxy 
+
+  ```bash
+  $ litellm --config /path/to/config.yaml --debug
+  ```
+
+3. Send Request to LiteLLM Proxy Server
+
+  <Tabs>
+
+  <TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+  ```python
+  import openai
+  client = openai.OpenAI(
+      api_key="sk-1234",             # pass litellm proxy key, if you're using virtual keys
+      base_url="http://0.0.0.0:4000" # litellm-proxy-base url
+  )
+
+  response = client.chat.completions.create(
+      model="llama-3",
+      messages = [
+        {
+            "role": "system",
+            "content": "Be a good human!"
+        },
+        {
+            "role": "user",
+            "content": "What do you know about earth?"
+        }
+    ]
+  )
+
+  print(response)
+  ```
+
+  </TabItem>
+
+  <TabItem value="curl" label="curl">
+
+  ```shell
+  curl --location 'http://0.0.0.0:4000/chat/completions' \
+      --header 'Authorization: Bearer sk-1234' \
+      --header 'Content-Type: application/json' \
+      --data '{
+      "model": "llama-3",
+      "messages": [
+        {
+            "role": "system",
+            "content": "Be a good human!"
+        },
+        {
+            "role": "user",
+            "content": "What do you know about earth?"
+        }
+        ]
+  }'
+  ```
+  </TabItem>
+
+  </Tabs>
+
+
+### Expected Replicate Call 
+
+This is the call litellm will make to replicate, from the above example: 
+
+```bash
+
+POST Request Sent from LiteLLM:
+curl -X POST \
+https://api.replicate.com/v1/models/meta/meta-llama-3-8b-instruct \
+-H 'Authorization: Token your-api-key' -H 'Content-Type: application/json' \
+-d '{'version': 'meta/meta-llama-3-8b-instruct', 'input': {'prompt': '<|start_header_id|>system<|end_header_id|>\n\nBe a good human!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat do you know about earth?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'}}'
+```
+
+</TabItem>
+
+</Tabs>
+
+## Advanced Usage - Prompt Formatting 
+
+LiteLLM has prompt template mappings for all `meta-llama` llama3 instruct models. [**See Code**](https://github.com/BerriAI/litellm/blob/4f46b4c3975cd0f72b8c5acb2cb429d23580c18a/litellm/llms/prompt_templates/factory.py#L1360)
+
+To apply a custom prompt template: 
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python 
+import litellm
+
+import os 
+os.environ["REPLICATE_API_KEY"] = ""
+
+# Create your own custom prompt template 
+litellm.register_prompt_template(
+    model="togethercomputer/LLaMA-2-7B-32K",
+    initial_prompt_value="You are a good assistant",  # [OPTIONAL]
+    roles={
+        "system": {
+            "pre_message": "[INST] <<SYS>>\n",  # [OPTIONAL]
+            "post_message": "\n<</SYS>>\n [/INST]\n",  # [OPTIONAL]
+        },
+        "user": {
+            "pre_message": "[INST] ",  # [OPTIONAL]
+            "post_message": " [/INST]",  # [OPTIONAL]
+        },
+        "assistant": {
+            "pre_message": "\n",  # [OPTIONAL]
+            "post_message": "\n",  # [OPTIONAL]
+        },
+    },
+    final_prompt_value="Now answer as best you can:",  # [OPTIONAL]
+)
+
+def test_replicate_custom_model():
+    model = "replicate/togethercomputer/LLaMA-2-7B-32K"
+    messages = [{"role": "user", "content": "Hello, how are you?"}]
+    response = litellm.completion(model=model, messages=messages)
+    print(response['choices'][0]['message']['content'])
+    return response
+
+test_replicate_custom_model()
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```yaml
+# Model-specific parameters
+model_list:
+  - model_name: mistral-7b # model alias
+    litellm_params: # actual params for litellm.completion()
+      model: "replicate/mistralai/Mistral-7B-Instruct-v0.1" 
+      api_key: os.environ/REPLICATE_API_KEY
+      initial_prompt_value: "\n"
+      roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}
+      final_prompt_value: "\n"
+      bos_token: "<s>"
+      eos_token: "</s>"
+      max_tokens: 4096
+```
+
+</TabItem>
+
+</Tabs>
+
+## Advanced Usage - Calling Replicate Deployments
 Calling a [deployed replicate LLM](https://replicate.com/deployments)
 Add the `replicate/deployments/` prefix to your model, so litellm will call the `deployments` endpoint. This will call `ishaan-jaff/ishaan-mistral` deployment on replicate

@ -40,7 +210,7 @@ Replicate responses can take 3-5 mins due to replicate cold boots, if you're try

 :::

-### Replicate Models
+## Replicate Models
 liteLLM supports all replicate LLMs

 For replicate models ensure to add a `replicate/` prefix to the `model` arg. liteLLM detects it using this arg. 
@ -49,15 +219,15 @@ Below are examples on how to call replicate LLMs using liteLLM

 Model Name                  | Function Call                                                  | Required OS Variables                |
 -----------------------------|----------------------------------------------------------------|--------------------------------------|
- replicate/llama-2-70b-chat | `completion(model='replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf', messages, supports_system_prompt=True)` | `os.environ['REPLICATE_API_KEY']`    |
- a16z-infra/llama-2-13b-chat| `completion(model='replicate/a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52', messages, supports_system_prompt=True)`| `os.environ['REPLICATE_API_KEY']`    |
+ replicate/llama-2-70b-chat | `completion(model='replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf', messages)` | `os.environ['REPLICATE_API_KEY']`    |
+ a16z-infra/llama-2-13b-chat| `completion(model='replicate/a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52', messages)`| `os.environ['REPLICATE_API_KEY']`    |
 replicate/vicuna-13b  | `completion(model='replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b', messages)` | `os.environ['REPLICATE_API_KEY']` |
 daanelson/flan-t5-large    | `completion(model='replicate/daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f', messages)`    | `os.environ['REPLICATE_API_KEY']`    |
 custom-llm    | `completion(model='replicate/custom-llm-version-id', messages)`    | `os.environ['REPLICATE_API_KEY']`    |
  replicate deployment    | `completion(model='replicate/deployments/ishaan-jaff/ishaan-mistral', messages)`    | `os.environ['REPLICATE_API_KEY']`    |


-### Passing additional params - max_tokens, temperature 
+## Passing additional params - max_tokens, temperature 
 See all litellm.completion supported params [here](https://docs.litellm.ai/docs/completion/input)

 ```python
@ -73,11 +243,22 @@ response = completion(
    messages = [{ "content": "Hello, how are you?","role": "user"}],
    max_tokens=20,
    temperature=0.5
-
 )
 ```

-### Passings Replicate specific params
+**proxy**
+
+```yaml
+  model_list:
+    - model_name: llama-3
+      litellm_params:
+        model: replicate/meta/meta-llama-3-8b-instruct
+        api_key: os.environ/REPLICATE_API_KEY
+        max_tokens: 20
+        temperature: 0.5
+```
+
+## Passings Replicate specific params
 Send params [not supported by `litellm.completion()`](https://docs.litellm.ai/docs/completion/input) but supported by Replicate by passing them to `litellm.completion`

 Example `seed`, `min_tokens` are Replicate specific param
@ -98,3 +279,15 @@ response = completion(
    top_k=20,
 )
 ```
+
+**proxy**
+
+```yaml
+  model_list:
+    - model_name: llama-3
+      litellm_params:
+        model: replicate/meta/meta-llama-3-8b-instruct
+        api_key: os.environ/REPLICATE_API_KEY
+        min_tokens: 2
+        top_k: 20
+```
--- a/docs/my-website/docs/providers/text_completion_openai.md
+++ b/docs/my-website/docs/providers/text_completion_openai.md
@ -0,0 +1,163 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# OpenAI (Text Completion)
+
+LiteLLM supports OpenAI text completion models
+
+### Required API Keys
+
+```python
+import os 
+os.environ["OPENAI_API_KEY"] = "your-api-key"
+```
+
+### Usage
+```python
+import os 
+from litellm import completion
+
+os.environ["OPENAI_API_KEY"] = "your-api-key"
+
+# openai call
+response = completion(
+    model = "gpt-3.5-turbo-instruct", 
+    messages=[{ "content": "Hello, how are you?","role": "user"}]
+)
+```
+
+### Usage - LiteLLM Proxy Server
+
+Here's how to call OpenAI models with the LiteLLM Proxy Server
+
+### 1. Save key in your environment
+
+```bash
+export OPENAI_API_KEY=""
+```
+
+### 2. Start the proxy 
+
+<Tabs>
+<TabItem value="config" label="config.yaml">
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: openai/gpt-3.5-turbo                          # The `openai/` prefix will call openai.chat.completions.create
+      api_key: os.environ/OPENAI_API_KEY
+  - model_name: gpt-3.5-turbo-instruct
+    litellm_params:
+      model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create
+      api_key: os.environ/OPENAI_API_KEY
+```
+</TabItem>
+<TabItem value="config-*" label="config.yaml - proxy all OpenAI models">
+
+Use this to add all openai models with one API Key. **WARNING: This will not do any load balancing**
+This means requests to `gpt-4`, `gpt-3.5-turbo` , `gpt-4-turbo-preview` will all go through this route 
+
+```yaml
+model_list:
+  - model_name: "*"             # all requests where model not in your config go to this deployment
+    litellm_params:
+      model: openai/*           # set `openai/` to use the openai route
+      api_key: os.environ/OPENAI_API_KEY
+```
+</TabItem>
+<TabItem value="cli" label="CLI">
+
+```bash
+$ litellm --model gpt-3.5-turbo-instruct
+
+# Server running on http://0.0.0.0:4000
+```
+</TabItem>
+
+</Tabs>
+
+### 3. Test it
+
+
+<Tabs>
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "gpt-3.5-turbo-instruct",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ]
+    }
+'
+```
+</TabItem>
+<TabItem value="openai" label="OpenAI v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo-instruct", messages = [
+    {
+        "role": "user",
+        "content": "this is a test request, write a short poem"
+    }
+])
+
+print(response)
+
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
+    model = "gpt-3.5-turbo-instruct",
+    temperature=0.1
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+
+## OpenAI Text Completion Models / Instruct Models
+
+| Model Name          | Function Call                                      |
+|---------------------|----------------------------------------------------|
+| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
+| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-0914", messages=messages)` |
+| text-davinci-003    | `response = completion(model="text-davinci-003", messages=messages)` |
+| ada-001             | `response = completion(model="ada-001", messages=messages)` |
+| curie-001           | `response = completion(model="curie-001", messages=messages)` |
+| babbage-001         | `response = completion(model="babbage-001", messages=messages)` |
+| babbage-002         | `response = completion(model="babbage-002", messages=messages)` |
+| davinci-002         | `response = completion(model="davinci-002", messages=messages)` |
--- a/docs/my-website/docs/providers/triton-inference-server.md
+++ b/docs/my-website/docs/providers/triton-inference-server.md
@ -0,0 +1,95 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Triton Inference Server
+
+LiteLLM supports Embedding Models on Triton Inference Servers
+
+
+## Usage
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+
+### Example Call
+
+Use the `triton/` prefix to route to triton server
+```python
+import litellm
+import os
+
+response = await litellm.aembedding(
+    model="triton/<your-triton-model>",                                                       
+    api_base="https://your-triton-api-base/triton/embeddings", # /embeddings endpoint you want litellm to call on your server
+    input=["good morning from litellm"],
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add models to your config.yaml
+
+  ```yaml
+  model_list:
+    - model_name: my-triton-model
+      litellm_params:
+        model: triton/<your-triton-model>
+        api_base: https://your-triton-api-base/triton/embeddings
+  ```
+
+
+2. Start the proxy 
+
+  ```bash
+  $ litellm --config /path/to/config.yaml --detailed_debug
+  ```
+
+3. Send Request to LiteLLM Proxy Server
+
+  <Tabs>
+
+  <TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+    ```python
+    import openai
+    from openai import OpenAI
+
+    # set base_url to your proxy server
+    # set api_key to send to proxy server
+    client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
+
+    response = client.embeddings.create(
+        input=["hello from litellm"],
+        model="my-triton-model"
+    )
+
+    print(response)
+
+    ```
+
+  </TabItem>
+
+  <TabItem value="curl" label="curl">
+
+  The `Authorization` header is optional; it is only required if you're using the LiteLLM proxy with Virtual Keys
+
+    ```shell
+    curl --location 'http://0.0.0.0:4000/embeddings' \
+    --header 'Content-Type: application/json' \
+    --header 'Authorization: Bearer sk-1234' \
+    --data ' {
+    "model": "my-triton-model",
+    "input": ["write a litellm poem"]
+    }'
+
+    ```
+  </TabItem>
+
+  </Tabs>
+
+
+</TabItem>
+
+</Tabs>
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@ -253,6 +253,7 @@ litellm.vertex_location = "us-central1 # Your Location
 ## Anthropic 
 | Model Name       | Function Call                        |
 |------------------|--------------------------------------|
+| claude-3-opus@20240229   | `completion('vertex_ai/claude-3-opus@20240229', messages)` |
 | claude-3-sonnet@20240229   | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
 | claude-3-haiku@20240307   | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |

@ -363,6 +364,8 @@ response = completion(
 | Model Name       | Function Call                        |
 |------------------|--------------------------------------|
 | gemini-1.5-pro   | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
+| gemini-1.5-flash-preview-0514   | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-flash-preview-0514', messages)` |
+| gemini-1.5-pro-preview-0514   | `completion('gemini-1.5-pro-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-pro-preview-0514', messages)` |



@ -476,6 +479,36 @@ print(response)
 | code-gecko@latest| `completion('code-gecko@latest', messages)` |


+## Embedding Models
+
+#### Usage - Embedding
+```python
+import litellm
+from litellm import embedding
+litellm.vertex_project = "hardy-device-38811" # Your Project ID
+litellm.vertex_location = "us-central1"  # proj location
+
+response = embedding(
+    model="vertex_ai/textembedding-gecko",
+    input=["good morning from litellm"],
+)
+print(response)
+```
+
+#### Supported Embedding Models
+All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported
+
+| Model Name               | Function Call                                                                                                                                                      |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| textembedding-gecko | `embedding(model="vertex_ai/textembedding-gecko", input)` | 
+| textembedding-gecko-multilingual | `embedding(model="vertex_ai/textembedding-gecko-multilingual", input)` | 
+| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` | 
+| textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` | 
+| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` | 
+| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
+| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` | 
+
+
 ## Extra

 ### Using `GOOGLE_APPLICATION_CREDENTIALS`
@ -519,6 +552,12 @@ def load_vertex_ai_credentials():

 ### Using GCP Service Account 

+:::info
+
+Trying to deploy LiteLLM on Google Cloud Run? Tutorial [here](https://docs.litellm.ai/docs/proxy/deploy#deploy-on-google-cloud-run)
+
+:::
+
 1. Figure out the Service Account bound to the Google Cloud Run service

 <Image img={require('../../img/gcp_acc_1.png')} />
--- a/docs/my-website/docs/providers/vllm.md
+++ b/docs/my-website/docs/providers/vllm.md
@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.

 🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)

+
+:::info
+
+To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
+
+:::
+
 ### Quick Start
 ```
 pip install litellm vllm
--- a/docs/my-website/docs/providers/watsonx.md
+++ b/docs/my-website/docs/providers/watsonx.md
@ -0,0 +1,284 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# IBM watsonx.ai
+
+LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.
+
+## Environment Variables
+```python
+os.environ["WATSONX_URL"] = ""  # (required) Base URL of your WatsonX instance
+# (required) either one of the following:
+os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key
+os.environ["WATSONX_TOKEN"] = "" # IAM auth token
+# optional - can also be passed as params to completion() or embedding()
+os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance
+os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models
+```
+
+See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.
+
+## Usage
+
+<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+</a>
+
+```python
+import os
+from litellm import completion
+
+os.environ["WATSONX_URL"] = ""
+os.environ["WATSONX_APIKEY"] = ""
+
+response = completion(
+  model="watsonx/ibm/granite-13b-chat-v2",
+  messages=[{ "content": "what is your favorite colour?","role": "user"}],
+  project_id="<my-project-id>" # or pass with os.environ["WATSONX_PROJECT_ID"]
+)
+
+response = completion(
+  model="watsonx/meta-llama/llama-3-8b-instruct",
+  messages=[{ "content": "what is your favorite colour?","role": "user"}],
+  project_id="<my-project-id>"
+)
+```
+
+## Usage - Streaming
+```python
+import os
+from litellm import completion
+
+os.environ["WATSONX_URL"] = ""
+os.environ["WATSONX_APIKEY"] = ""
+os.environ["WATSONX_PROJECT_ID"] = ""
+
+response = completion(
+  model="watsonx/ibm/granite-13b-chat-v2",
+  messages=[{ "content": "what is your favorite colour?","role": "user"}],
+  stream=True
+)
+for chunk in response:
+  print(chunk)
+```
+
+#### Example Streaming Output Chunk
+```json
+{
+  "choices": [
+    {
+      "finish_reason": null,
+      "index": 0,
+      "delta": {
+        "content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
+      }
+    }
+  ],
+  "created": null,
+  "model": "watsonx/ibm/granite-13b-chat-v2",
+  "usage": {
+    "prompt_tokens": null,
+    "completion_tokens": null,
+    "total_tokens": null
+  }
+}
+```
+
+## Usage - Models in deployment spaces
+
+Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space). 
+
+The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`. 
+
+```python
+import litellm
+response = litellm.completion(
+    model="watsonx/deployment/<deployment_id>",
+    messages=[{"content": "Hello, how are you?", "role": "user"}],
+    space_id="<deployment_space_id>"
+)
+```
+
+## Usage - Embeddings
+
+LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.
+
+```python
+from litellm import embedding
+
+response = embedding(
+    model="watsonx/ibm/slate-30m-english-rtrvr",
+    input=["What is the capital of France?"],
+    project_id="<my-project-id>"
+)
+print(response)
+# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
+```
+
+## OpenAI Proxy Usage 
+
+Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server
+
+### 1. Save keys in your environment
+
+```bash
+export WATSONX_URL=""
+export WATSONX_APIKEY=""
+export WATSONX_PROJECT_ID=""
+```
+
+### 2. Start the proxy 
+
+<Tabs>
+<TabItem value="cli" label="CLI">
+
+```bash
+$ litellm --model watsonx/meta-llama/llama-3-8b-instruct
+
+# Server running on http://0.0.0.0:4000
+```
+
+</TabItem>
+<TabItem value="config" label="config.yaml">
+
+```yaml
+model_list:
+  - model_name: llama-3-8b
+    litellm_params:
+      # all params accepted by litellm.completion()
+      model: watsonx/meta-llama/llama-3-8b-instruct
+      api_key: "os.environ/WATSONX_APIKEY" # does os.getenv("WATSONX_APIKEY")
+```
+</TabItem>
+</Tabs>
+
+### 3. Test it
+
+
+<Tabs>
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "llama-3-8b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what is your favorite colour?"
+        }
+      ]
+    }
+'
+```
+</TabItem>
+<TabItem value="openai" label="OpenAI v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="llama-3-8b", messages=[
+    {
+        "role": "user",
+        "content": "what is your favorite colour?"
+    }
+])
+
+print(response)
+
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
+    model = "llama-3-8b",
+    temperature=0.1
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+
+## Authentication
+
+### Passing credentials as parameters
+
+You can also pass the credentials as parameters to the completion and embedding functions.
+
+```python
+import os
+from litellm import completion
+
+response = completion(
+            model="watsonx/ibm/granite-13b-chat-v2",
+            messages=[{ "content": "What is your favorite color?","role": "user"}],
+            url="",
+            api_key="",
+            project_id=""
+)
+```
+
+
+## Supported IBM watsonx.ai Models
+
+Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:
+
+| Model Name | Command |
+| ---------- | --------- |
+| Flan T5 XXL | `completion(model="watsonx/google/flan-t5-xxl", messages=messages)` |
+| Flan Ul2 | `completion(model="watsonx/google/flan-ul2", messages=messages)` |
+| Mt0 XXL | `completion(model="watsonx/bigscience/mt0-xxl", messages=messages)` |
+| Gpt Neox | `completion(model="watsonx/eleutherai/gpt-neox-20b", messages=messages)` |
+| Mpt 7B Instruct2 | `completion(model="watsonx/ibm/mpt-7b-instruct2", messages=messages)` |
+| Starcoder | `completion(model="watsonx/bigcode/starcoder", messages=messages)` |
+| Llama 2 70B Chat | `completion(model="watsonx/meta-llama/llama-2-70b-chat", messages=messages)` |
+| Llama 2 13B Chat | `completion(model="watsonx/meta-llama/llama-2-13b-chat", messages=messages)` |
+| Granite 13B Instruct | `completion(model="watsonx/ibm/granite-13b-instruct-v1", messages=messages)` |
+| Granite 13B Chat | `completion(model="watsonx/ibm/granite-13b-chat-v1", messages=messages)` |
+| Flan T5 XL | `completion(model="watsonx/google/flan-t5-xl", messages=messages)` |
+| Granite 13B Chat V2 | `completion(model="watsonx/ibm/granite-13b-chat-v2", messages=messages)` |
+| Granite 13B Instruct V2 | `completion(model="watsonx/ibm/granite-13b-instruct-v2", messages=messages)` |
+| Elyza Japanese Llama 2 7B Instruct | `completion(model="watsonx/elyza/elyza-japanese-llama-2-7b-instruct", messages=messages)` |
+| Mixtral 8X7B Instruct V01 Q | `completion(model="watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q", messages=messages)` |
+
+
+For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).
+
+
+## Supported IBM watsonx.ai Embedding Models
+
+| Model Name           | Function Call                               |
+|----------------------|---------------------------------------------|
+| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
+| Slate 125m  | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |
+
+
+For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).
--- a/docs/my-website/docs/proxy/alerting.md
+++ b/docs/my-website/docs/proxy/alerting.md
@ -1,13 +1,18 @@
-# Slack Alerting
+# 🚨 Alerting 

 Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
-    - When a User/Key crosses their Budget 
-    - When a User/Key is 15% away from crossing their Budget
- failed db read/writes
+
+- Hanging LLM api calls
+- Slow LLM api calls
+- Failed LLM api calls
+- Budget Tracking per key/user
+- Spend Reports - Weekly & Monthly spend per Team, Tag
+- Failed db read/writes
+- Daily Reports:
+    - **LLM** Top 5 slowest deployments
+    - **LLM** Top 5 deployments with most failed requests
+- **Spend** Weekly & Monthly spend per Team, Tag
+

 ## Quick Start

@ -17,10 +22,12 @@ Set up a slack alert channel to receive alerts from proxy.

 Get a slack webhook url from https://api.slack.com/messaging/webhooks

+You can also use Discord Webhooks, see [here](#using-discord-webhooks)

 ### Step 2: Update config.yaml 

-Let's save a bad key to our proxy
+- Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
+- Just for testing purposes, let's save a bad key to our proxy.

 ```yaml
 model_list: 
@ -33,16 +40,88 @@ general_settings:
    alerting: ["slack"]
    alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+ 

+environment_variables:
+    SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
+    SLACK_DAILY_REPORT_FREQUENCY: "86400"  # 24 hours; Optional: defaults to 12 hours
 ```

-Set `SLACK_WEBHOOK_URL` in your proxy env
-
-```shell
-SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
-```

 ### Step 3: Start proxy

 ```bash
 $ litellm --config /path/to/config.yaml
 ```
+
+## Testing Alerting is Setup Correctly
+
+Make a GET request to `/health/services`, expect to see a test slack alert in your provided webhook slack channel
+
+```shell
+curl -X GET 'http://localhost:4000/health/services?service=slack' \
+  -H 'Authorization: Bearer sk-1234'
+```
+
+## Advanced
+### Opting into specific alert types
+
+Set `alert_types` if you want to Opt into only specific alert types
+
+```yaml
+general_settings:
+  alerting: ["slack"]
+  alert_types: ["spend_reports"] 
+```
+
+All Possible Alert Types
+
+```python
+alert_types: 
+Optional[
+List[
+    Literal[
+        "llm_exceptions",
+        "llm_too_slow",
+        "llm_requests_hanging",
+        "budget_alerts",
+        "db_exceptions",
+        "daily_reports",
+        "spend_reports",
+        "cooldown_deployment",
+        "new_model_added",
+    ]
+]
+]
+```
+
+
+### Using Discord Webhooks
+
+Discord provides a slack compatible webhook url that you can use for alerting
+
+##### Quick Start
+
+1. Get a webhook url for your discord channel 
+
+2. Append `/slack` to your discord webhook - it should look like
+
+```
+"https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
+```
+
+3. Add it to your litellm config 
+
+```yaml
+model_list: 
+    - model_name: "azure-model"
+      litellm_params:
+          model: "azure/gpt-35-turbo"
+          api_key: "my-bad-key" # 👈 bad key
+
+general_settings: 
+    alerting: ["slack"]
+    alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+ 
+
+environment_variables:
+    SLACK_WEBHOOK_URL: "https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
+```
+
+That's it! You're ready to go!
--- a/docs/my-website/docs/proxy/billing.md
+++ b/docs/my-website/docs/proxy/billing.md
@ -0,0 +1,319 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# 💵 Billing
+
+Bill internal teams and external customers for their usage.
+
+**🚨 Requirements**
+- [Setup Lago](https://docs.getlago.com/guide/self-hosted/docker#run-the-app), for usage-based billing. We recommend following [their Stripe tutorial](https://docs.getlago.com/templates/per-transaction/stripe#step-1-create-billable-metrics-for-transaction)
+
+Steps:
+- Connect the proxy to Lago
+- Set the id you want to bill for (customers, internal users, teams)
+- Start! 
+
+## Quick Start
+
+Bill internal teams for their usage
+
+### 1. Connect proxy to Lago 
+
+Set 'lago' as a callback on your proxy config.yaml
+
+```yaml
+model_list:
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+litellm_settings:
+  callbacks: ["lago"] # 👈 KEY CHANGE
+
+general_settings:
+  master_key: sk-1234
+```
+
+Add your Lago keys to the environment
+
+```bash
+export LAGO_API_BASE="http://localhost:3000" # self-host - https://docs.getlago.com/guide/self-hosted/docker#run-the-app
+export LAGO_API_KEY="3e29d607-de54-49aa-a019-ecf585729070" # Get key - https://docs.getlago.com/guide/self-hosted/docker#find-your-api-key
+export LAGO_API_EVENT_CODE="openai_tokens" # name of lago billing code
+export LAGO_API_CHARGE_BY="team_id" # 👈 Charges 'team_id' attached to proxy key
+```
+
+Start proxy 
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+### 2. Create Key for Internal Team 
+
+```bash
+curl 'http://0.0.0.0:4000/key/generate' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data-raw '{"team_id": "my-unique-id"}' # 👈 Internal Team's ID
+```
+
+Response Object:
+
+```bash
+{
+  "key": "sk-tXL0wt5-lOOVK9sfY2UacA",
+}
+```
+
+
+### 3. Start billing! 
+
+<Tabs>
+<TabItem value="curl" label="Curl">
+
+```bash
+# 👇 the Authorization header carries the Team's Key
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-tXL0wt5-lOOVK9sfY2UacA' \
+--data ' {
+      "model": "fake-openai-endpoint",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ]
+    }
+'
+```
+</TabItem>
+<TabItem value="openai_python" label="OpenAI Python SDK">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="sk-tXL0wt5-lOOVK9sfY2UacA", # 👈 Team's Key
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
+    {
+        "role": "user",
+        "content": "this is a test request, write a short poem"
+    }
+])
+
+print(response)
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+import os 
+
+os.environ["OPENAI_API_KEY"] = "sk-tXL0wt5-lOOVK9sfY2UacA" # 👈 Team's Key
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000",
+    model = "gpt-3.5-turbo",
+    temperature=0.1,
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+**See Results on Lago**
+
+
+<Image img={require('../../img/lago_2.png')}  style={{ width: '500px', height: 'auto' }} />
+
+## Advanced - Lago Logging object 
+
+This is what LiteLLM will log to Lago
+
+```
+{
+    "event": {
+      "transaction_id": "<generated_unique_id>",
+      "external_customer_id": <selected_id>, # either 'end_user_id', 'user_id', or 'team_id'. Default 'end_user_id'. 
+      "code": os.getenv("LAGO_API_EVENT_CODE"), 
+      "properties": {
+          "input_tokens": <number>,
+          "output_tokens": <number>,
+          "model": <string>,
+          "response_cost": <number>, # 👈 LITELLM CALCULATED RESPONSE COST - https://github.com/BerriAI/litellm/blob/d43f75150a65f91f60dc2c0c9462ce3ffc713c1f/litellm/utils.py#L1473
+      }
+    }
+}
+```
+
+## Advanced - Bill Customers, Internal Users 
+
+For:
+- Customers (id passed via 'user' param in `/chat/completions` call) = 'end_user_id'
+- Internal Users (id set when [creating keys](https://docs.litellm.ai/docs/proxy/virtual_keys#advanced---spend-tracking)) = 'user_id' 
+- Teams (id set when [creating keys](https://docs.litellm.ai/docs/proxy/virtual_keys#advanced---spend-tracking)) = 'team_id' 
+
+
+
+<Tabs>
+<TabItem value="customers" label="Customer Billing">
+
+1. Set 'LAGO_API_CHARGE_BY' to 'end_user_id'
+
+  ```bash
+  export LAGO_API_CHARGE_BY="end_user_id"
+  ```
+
+2. Test it!
+
+  <Tabs>
+  <TabItem value="curl" label="Curl">
+
+  ```shell
+  # 👇 set "user" in the request body to whatever your customer id is
+  curl --location 'http://0.0.0.0:4000/chat/completions' \
+  --header 'Content-Type: application/json' \
+  --data ' {
+        "model": "gpt-3.5-turbo",
+        "messages": [
+          {
+            "role": "user",
+            "content": "what llm are you"
+          }
+        ],
+        "user": "my_customer_id"
+      }
+  '
+  ```
+  </TabItem>
+  <TabItem value="openai_sdk" label="OpenAI Python SDK">
+
+  ```python
+  import openai
+  client = openai.OpenAI(
+      api_key="anything",
+      base_url="http://0.0.0.0:4000"
+  )
+
+  # request sent to model set on litellm proxy, `litellm --model`
+  response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
+      {
+          "role": "user",
+          "content": "this is a test request, write a short poem"
+      }
+  ], user="my_customer_id") # 👈 whatever your customer id is
+
+  print(response)
+  ```
+
+  </TabItem>
+  <TabItem value="langchain" label="Langchain">
+
+  ```python
+  from langchain.chat_models import ChatOpenAI
+  from langchain.prompts.chat import (
+      ChatPromptTemplate,
+      HumanMessagePromptTemplate,
+      SystemMessagePromptTemplate,
+  )
+  from langchain.schema import HumanMessage, SystemMessage
+  import os 
+
+  os.environ["OPENAI_API_KEY"] = "anything"
+
+  chat = ChatOpenAI(
+      openai_api_base="http://0.0.0.0:4000",
+      model = "gpt-3.5-turbo",
+      temperature=0.1,
+      extra_body={
+          "user": "my_customer_id"  # 👈 whatever your customer id is
+      }
+  )
+
+  messages = [
+      SystemMessage(
+          content="You are a helpful assistant that im using to make a test request to."
+      ),
+      HumanMessage(
+          content="test from litellm. tell me why it's amazing in 1 sentence"
+      ),
+  ]
+  response = chat(messages)
+
+  print(response)
+  ```
+
+  </TabItem>
+  </Tabs>
+
+</TabItem>
+<TabItem value="users" label="Internal User Billing">
+
+1. Set 'LAGO_API_CHARGE_BY' to 'user_id'
+
+```bash
+export LAGO_API_CHARGE_BY="user_id"
+```
+
+2. Create a key for that user 
+
+```bash
+curl 'http://0.0.0.0:4000/key/generate' \
+--header 'Authorization: Bearer <your-master-key>' \
+--header 'Content-Type: application/json' \
+--data-raw '{"user_id": "my-unique-id"}' # 👈 Internal User's id
+```
+
+Response Object:
+
+```bash
+{
+  "key": "sk-tXL0wt5-lOOVK9sfY2UacA",
+}
+```
+
+3. Make API Calls with that Key 
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="sk-tXL0wt5-lOOVK9sfY2UacA", # 👈 Generated key
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
+    {
+        "role": "user",
+        "content": "this is a test request, write a short poem"
+    }
+])
+
+print(response)
+```
+</TabItem>
+</Tabs>
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@ -61,6 +61,22 @@ litellm_settings:
    ttl: 600 # will be cached on redis for 600s
 ```

+
+## SSL
+
+Just set `REDIS_SSL="True"` in your .env, and LiteLLM will pick this up. 
+
+```env
+REDIS_SSL="True"
+```
+
+For quick testing, you can also use REDIS_URL, e.g.:
+
+```
+REDIS_URL="rediss://.."
+```
+
+However, we **don't** recommend using REDIS_URL in prod. We've noticed a performance difference between using it vs. redis_host, port, etc. 
 #### Step 2: Add Redis Credentials to .env
 Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.

--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@ -62,9 +62,11 @@ model_list:

 litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
+  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env

 general_settings: 
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
+  alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
 ```
 :::info

@ -600,6 +602,7 @@ general_settings:
  "general_settings": {
    "completion_model": "string",
    "disable_spend_logs": "boolean", # turn off writing each transaction to the db
+    "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
    "disable_reset_budget": "boolean", # turn off reset budget scheduled task
    "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
    "enforce_user_param": "boolean", # requires all openai endpoint requests to have a 'user' param
--- a/docs/my-website/docs/proxy/cost_tracking.md
+++ b/docs/my-website/docs/proxy/cost_tracking.md
@ -1,8 +1,161 @@
-# Cost Tracking - Azure
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# 💸 Spend Tracking
+
+Track spend for keys, users, and teams across 100+ LLMs.
+
+## Getting Spend Reports - To Charge Other Teams, API Keys
+
+Use the `/global/spend/report` endpoint to get daily spend per team, with a breakdown of spend per API Key, Model
+
+### Example Request
+
+```shell
+curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
+  -H 'Authorization: Bearer sk-1234'
+```
+
+### Example Response
+<Tabs>
+
+<TabItem value="response" label="Expected Response">
+
+```shell
+[
+    {
+        "group_by_day": "2024-04-30T00:00:00+00:00",
+        "teams": [
+            {
+                "team_name": "Prod Team",
+                "total_spend": 0.0015265,
+                "metadata": [ # see the spend by unique(key + model)
+                    {
+                        "model": "gpt-4",
+                        "spend": 0.00123,
+                        "total_tokens": 28,
+                        "api_key": "88dc28.." # the hashed api key
+                    },
+                    {
+                        "model": "gpt-4",
+                        "spend": 0.00123,
+                        "total_tokens": 28,
+                        "api_key": "a73dc2.." # the hashed api key
+                    },
+                    {
+                        "model": "chatgpt-v-2",
+                        "spend": 0.000214,
+                        "total_tokens": 122,
+                        "api_key": "898c28.." # the hashed api key
+                    },
+                    {
+                        "model": "gpt-3.5-turbo",
+                        "spend": 0.0000825,
+                        "total_tokens": 85,
+                        "api_key": "84dc28.." # the hashed api key
+                    }
+                ]
+            }
+        ]
+    }
+]
+```
+
+
+</TabItem>
+
+<TabItem value="py-script" label="Script to Parse Response (Python)">
+
+```python
+import requests
+url = 'http://localhost:4000/global/spend/report'
+params = {
+    'start_date': '2024-04-01',
+    'end_date': '2024-06-30'
+}
+
+headers = {
+    'Authorization': 'Bearer sk-1234'
+}
+
+# Make the GET request
+response = requests.get(url, headers=headers, params=params)
+spend_report = response.json()
+
+for row in spend_report:
+  date = row["group_by_day"]
+  teams = row["teams"]
+  for team in teams:
+      team_name = team["team_name"]
+      total_spend = team["total_spend"]
+      metadata = team["metadata"]
+
+      print(f"Date: {date}")
+      print(f"Team: {team_name}")
+      print(f"Total Spend: {total_spend}")
+      print("Metadata: ", metadata)
+      print()
+```
+
+Output from script
+```shell
+# Date: 2024-05-11T00:00:00+00:00
+# Team: local_test_team
+# Total Spend: 0.003675099999999999
+# Metadata:  [{'model': 'gpt-3.5-turbo', 'spend': 0.003675099999999999, 'api_key': 'b94d5e0bc3a71a573917fe1335dc0c14728c7016337451af9714924ff3a729db', 'total_tokens': 3105}]
+
+# Date: 2024-05-13T00:00:00+00:00
+# Team: Unassigned Team
+# Total Spend: 3.4e-05
+# Metadata:  [{'model': 'gpt-3.5-turbo', 'spend': 3.4e-05, 'api_key': '9569d13c9777dba68096dea49b0b03e0aaf4d2b65d4030eda9e8a2733c3cd6e0', 'total_tokens': 50}]
+
+# Date: 2024-05-13T00:00:00+00:00
+# Team: central
+# Total Spend: 0.000684
+# Metadata:  [{'model': 'gpt-3.5-turbo', 'spend': 0.000684, 'api_key': '0323facdf3af551594017b9ef162434a9b9a8ca1bbd9ccbd9d6ce173b1015605', 'total_tokens': 498}]
+
+# Date: 2024-05-13T00:00:00+00:00
+# Team: local_test_team
+# Total Spend: 0.0005715000000000001
+# Metadata:  [{'model': 'gpt-3.5-turbo', 'spend': 0.0005715000000000001, 'api_key': 'b94d5e0bc3a71a573917fe1335dc0c14728c7016337451af9714924ff3a729db', 'total_tokens': 423}]
+```
+
+
+</TabItem>
+
+</Tabs>
+
+
+## Reset Team, API Key Spend - MASTER KEY ONLY
+
+Use `/global/spend/reset` if you want to:
+- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0`
+
+- LiteLLM will maintain all the logs in `LiteLLM_SpendLogs` for auditing purposes
+
+### Request 
+Only the `LITELLM_MASTER_KEY` you set can access this route
+```shell
+curl -X POST \
+  'http://localhost:4000/global/spend/reset' \
+  -H 'Authorization: Bearer sk-1234' \
+  -H 'Content-Type: application/json'
+```
+
+### Expected Responses
+
+```shell
+{"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
+```
+
+
+
+
+## Spend Tracking for Azure

 Set base model for cost tracking azure image-gen call

-## Image Generation 
+### Image Generation 

 ```yaml
 model_list: 
@ -17,7 +170,7 @@ model_list:
        mode: image_generation
 ```

-## Chat Completions / Embeddings
+### Chat Completions / Embeddings

 **Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking

--- a/docs/my-website/docs/proxy/customer_routing.md
+++ b/docs/my-website/docs/proxy/customer_routing.md
@ -0,0 +1,83 @@
+# Region-based Routing
+
+Route specific customers to eu-only models.
+
+By specifying 'allowed_model_region' for a customer, LiteLLM will filter out any models in a model group that are not in the allowed region (e.g. 'eu').
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/5eb12e30cc5faa73799ebc7e48fc86ebf449c879/litellm/router.py#L2938)
+
+### 1. Create customer with region-specification
+
+Use the litellm 'end-user' object for this. 
+
+End-users can be tracked / id'ed by passing the 'user' param to litellm in an openai chat completion/embedding call.
+
+```bash
+curl -X POST --location 'http://0.0.0.0:4000/end_user/new' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{
+    "user_id" : "ishaan-jaff-45",
+    "allowed_model_region": "eu", # 👈 SPECIFY ALLOWED REGION='eu'
+}'
+```
+
+### 2. Add eu models to model-group 
+
+Add eu models to a model group. For azure models, litellm can automatically infer the region (no need to set it). 
+
+```yaml
+model_list:
+    - model_name: gpt-3.5-turbo
+      litellm_params:
+        model: azure/gpt-35-turbo-eu # 👈 EU azure model
+        api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
+        api_key: os.environ/AZURE_EUROPE_API_KEY
+    - model_name: gpt-3.5-turbo
+      litellm_params:
+        model: azure/chatgpt-v-2
+        api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
+        api_version: "2023-05-15"
+        api_key: os.environ/AZURE_API_KEY
+
+router_settings:
+  enable_pre_call_checks: true # 👈 IMPORTANT
+```
+
+Start the proxy
+
+```shell
+litellm --config /path/to/config.yaml
+```
+
+### 3. Test it!
+
+Make a simple chat completions call to the proxy. In the response headers, you should see the returned api base. 
+
+```bash
+curl -X POST --location 'http://localhost:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-1234' \
+--data '{
+    "model": "gpt-3.5-turbo", 
+    "messages": [
+        {
+        "role": "user",
+        "content": "what is the meaning of the universe? 1234"
+    }],
+    "user": "ishaan-jaff-45" # 👈 USER ID
+}
+'
+```
+
+Expected API Base in response headers 
+
+```
+x-litellm-api-base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
+```
+
+### FAQ 
+
+**What happens if there are no available models for that region?**
+
+Since the router filters out models that are not in the specified region, it will return an error to the user if no models in that region are available.
--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber

 <TabItem value="basic" label="Basic">

-**Step 1. Create a file called `litellm_config.yaml`**
+### Step 1. CREATE config.yaml 

-  Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env)
-  ```yaml
-  model_list:
-    - model_name: azure-gpt-3.5
-      litellm_params:
-        model: azure/<your-azure-model-deployment>
-        api_base: os.environ/AZURE_API_BASE
-        api_key: os.environ/AZURE_API_KEY
-        api_version: "2023-07-01-preview"
-  ```
+Example `litellm_config.yaml` 

-**Step 2. Run litellm docker image**
+```yaml
+model_list:
+  - model_name: azure-gpt-3.5
+    litellm_params:
+      model: azure/<your-azure-model-deployment>
+      api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
+      api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
+      api_version: "2023-07-01-preview"
+```

-  See the latest available ghcr docker image here:
-  https://github.com/berriai/litellm/pkgs/container/litellm

-  Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command. 
-  The `-v` command will mount that file

-  Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1
+### Step 2. RUN Docker Image

-  ```shell
-  docker run \
-      -v $(pwd)/litellm_config.yaml:/app/config.yaml \
-      -e AZURE_API_KEY=d6*********** \
-      -e AZURE_API_BASE=https://openai-***********/ \
-      -p 4000:4000 \
-      ghcr.io/berriai/litellm:main-latest \
-      --config /app/config.yaml --detailed_debug
-  ```
+```shell
+docker run \
+    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e AZURE_API_KEY=d6*********** \
+    -e AZURE_API_BASE=https://openai-***********/ \
+    -p 4000:4000 \
+    ghcr.io/berriai/litellm:main-latest \
+    --config /app/config.yaml --detailed_debug
+```

-**Step 3. Send a Test Request**
+Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
+
+### Step 3. TEST Request

  Pass `model=azure-gpt-3.5` this was set on step 1

@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
 | Docs | When to Use |
 | --- | --- |
 | [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
-| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
+| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: when deploying with a database, `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env) |
 | [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
 | [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |

 ## Deploy with Database
 ### Docker, Kubernetes, Helm Chart

+Requirements:
+- You need a Postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc.). Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
+- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)

 <Tabs>

@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
 ```shell
 docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e LITELLM_MASTER_KEY=sk-1234 \
+    -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
@ -267,26 +269,63 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
 #### Step 1. Create deployment.yaml

 ```yaml
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     name: litellm-deployment
-   spec:
-     replicas: 1
-     selector:
-       matchLabels:
-         app: litellm
-     template:
-       metadata:
-         labels:
-           app: litellm
-       spec:
-         containers:
-           - name: litellm-container
-             image: ghcr.io/berriai/litellm-database:main-latest
-             env:
-              - name: DATABASE_URL
-                value: postgresql://<user>:<password>@<host>:<port>/<dbname>
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-deployment
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      containers:
+        - name: litellm-container
+          image: ghcr.io/berriai/litellm:main-latest
+          imagePullPolicy: Always
+          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
+            - name: DATABASE_URL
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml"  # Update the path to mount the config file
+          volumeMounts:                 # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes:  # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config  
+
 ```

 ```bash
--- a/docs/my-website/docs/proxy/logging.md
+++ b/docs/my-website/docs/proxy/logging.md
@ -3,19 +3,21 @@ import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';


-# 🔎 Logging - Custom Callbacks, DataDog, Langfuse, s3 Bucket, Sentry, OpenTelemetry, Athina
+# 🔎 Logging - Custom Callbacks, DataDog, Langfuse, s3 Bucket, Sentry, OpenTelemetry, Athina, Azure Content-Safety

 Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTelemetry, LangFuse, DynamoDB, s3 Bucket

 - [Async Custom Callbacks](#custom-callback-class-async)
 - [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
 - [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
+- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
 - [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
+- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
 - [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
 - [Logging to Sentry](#logging-proxy-inputoutput---sentry)
 - [Logging to Traceloop (OpenTelemetry)](#logging-proxy-inputoutput-traceloop-opentelemetry)
 - [Logging to Athina](#logging-proxy-inputoutput-athina)
+- [(BETA) Moderation with Azure Content-Safety](#beta-moderation-with-azure-content-safety)

 ## Custom Callback Class [Async]
 Use this when you want to run custom callbacks in `python`
@ -401,7 +403,7 @@ litellm_settings:
 Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API 

 ## Logging Proxy Input/Output - Langfuse
-We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse
+We will use the `--config` to set `litellm.success_callback = ["langfuse"]`. This will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment.

 **Step 1** Install langfuse

@ -419,7 +421,13 @@ litellm_settings:
  success_callback: ["langfuse"]
 ```

-**Step 3**: Start the proxy, make a test request
+**Step 3**: Set required env variables for logging to langfuse
+```shell
+export LANGFUSE_PUBLIC_KEY="pk_kk"
+export LANGFUSE_SECRET_KEY="sk_ss"
+```
+
+**Step 4**: Start the proxy, make a test request

 Start proxy
 ```shell
@ -539,6 +547,105 @@ print(response)
 </Tabs>


+### Team based Logging to Langfuse
+
+**Example:**
+
+This config would send langfuse logs to 2 different langfuse projects, based on the team id 
+
+```yaml
+litellm_settings:
+  default_team_settings: 
+    - team_id: my-secret-project
+      success_callback: ["langfuse"]
+      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
+      langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
+    - team_id: ishaans-secret-project
+      success_callback: ["langfuse"]
+      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
+      langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
+```
+
+Now, when you [generate keys](./virtual_keys.md) for this team-id 
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/key/generate' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{"team_id": "ishaans-secret-project"}'
+```
+
+All requests made with these keys will log data to their team-specific logging.
+
+### Redacting Messages, Response Content from Langfuse Logging 
+
+Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to Langfuse, but request metadata will still be logged.
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+litellm_settings:
+  success_callback: ["langfuse"]
+  turn_off_message_logging: True
+```
+
+
+
+## Logging Proxy Cost + Usage - OpenMeter
+
+Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
+
+**Required Env Variables**
+
+```bash
+# from https://openmeter.cloud
+export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
+export OPENMETER_API_KEY=""
+```
+
+### Quick Start 
+
+1. Add to Config.yaml
+```yaml
+model_list:
+- litellm_params:
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_key: my-fake-key
+    model: openai/my-fake-model
+  model_name: fake-openai-endpoint
+
+litellm_settings:
+  success_callback: ["openmeter"] # 👈 KEY CHANGE
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "fake-openai-endpoint",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ]
+    }
+'
+```
+
+
+<Image img={require('../../img/openmeter_img_2.png')} />
+
 ## Logging Proxy Input/Output - DataDog
 We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successfull LLM calls to DataDog

@ -808,39 +915,72 @@ Test Request
 litellm --test
 ```

-## Logging Proxy Input/Output Traceloop (OpenTelemetry)
+## Logging Proxy Input/Output in OpenTelemetry format using Traceloop's OpenLLMetry

-Traceloop allows you to log LLM Input/Output in the OpenTelemetry format
+[OpenLLMetry](https://github.com/traceloop/openllmetry) _(built and maintained by Traceloop)_ is a set of extensions
+built on top of [OpenTelemetry](https://opentelemetry.io/) that gives you complete observability over your LLM
+application. Because it uses OpenTelemetry under the
+hood, [it can be connected to various observability solutions](https://www.traceloop.com/docs/openllmetry/integrations/introduction)
+like:

-We will use the `--config` to set `litellm.success_callback = ["traceloop"]` this will log all successfull LLM calls to traceloop
+* [Traceloop](https://www.traceloop.com/docs/openllmetry/integrations/traceloop)
+* [Axiom](https://www.traceloop.com/docs/openllmetry/integrations/axiom)
+* [Azure Application Insights](https://www.traceloop.com/docs/openllmetry/integrations/azure)
+* [Datadog](https://www.traceloop.com/docs/openllmetry/integrations/datadog)
+* [Dynatrace](https://www.traceloop.com/docs/openllmetry/integrations/dynatrace)
+* [Grafana Tempo](https://www.traceloop.com/docs/openllmetry/integrations/grafana)
+* [Honeycomb](https://www.traceloop.com/docs/openllmetry/integrations/honeycomb)
+* [HyperDX](https://www.traceloop.com/docs/openllmetry/integrations/hyperdx)
+* [Instana](https://www.traceloop.com/docs/openllmetry/integrations/instana)
+* [New Relic](https://www.traceloop.com/docs/openllmetry/integrations/newrelic)
+* [OpenTelemetry Collector](https://www.traceloop.com/docs/openllmetry/integrations/otel-collector)
+* [Service Now Cloud Observability](https://www.traceloop.com/docs/openllmetry/integrations/service-now)
+* [Sentry](https://www.traceloop.com/docs/openllmetry/integrations/sentry)
+* [SigNoz](https://www.traceloop.com/docs/openllmetry/integrations/signoz)
+* [Splunk](https://www.traceloop.com/docs/openllmetry/integrations/splunk)

-**Step 1** Install traceloop-sdk and set Traceloop API key
+We will use the `--config` to set `litellm.success_callback = ["traceloop"]` to achieve this, steps are listed below.
+
+**Step 1:** Install the SDK

 ```shell
-pip install traceloop-sdk -U
+pip install traceloop-sdk
 ```

-Traceloop outputs standard OpenTelemetry data that can be connected to your observability stack. Send standard OpenTelemetry from LiteLLM Proxy to [Traceloop](https://www.traceloop.com/docs/openllmetry/integrations/traceloop), [Dynatrace](https://www.traceloop.com/docs/openllmetry/integrations/dynatrace), [Datadog](https://www.traceloop.com/docs/openllmetry/integrations/datadog)
-, [New Relic](https://www.traceloop.com/docs/openllmetry/integrations/newrelic), [Honeycomb](https://www.traceloop.com/docs/openllmetry/integrations/honeycomb), [Grafana Tempo](https://www.traceloop.com/docs/openllmetry/integrations/grafana), [Splunk](https://www.traceloop.com/docs/openllmetry/integrations/splunk), [OpenTelemetry Collector](https://www.traceloop.com/docs/openllmetry/integrations/otel-collector)
+**Step 2:** Configure Environment Variable for trace exporting
+
+You will need to configure where to export your traces. Environment variables will control this, example: For Traceloop
+you should use `TRACELOOP_API_KEY`, whereas for Datadog you use `TRACELOOP_BASE_URL`. For more
+visit [the Integrations Catalog](https://www.traceloop.com/docs/openllmetry/integrations/introduction).
+
+If you are using Datadog as the observability solution, then you can set `TRACELOOP_BASE_URL` as:
+
+```shell
+TRACELOOP_BASE_URL=http://<datadog-agent-hostname>:4318
+```
+
+**Step 3**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`

-**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
 ```yaml
 model_list:
- - model_name: gpt-3.5-turbo
+  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
+      api_key: my-fake-key # replace api_key with actual key
 litellm_settings:
-  success_callback: ["traceloop"]
+  success_callback: [ "traceloop" ]
 ```

-**Step 3**: Start the proxy, make a test request
+**Step 4**: Start the proxy, make a test request

 Start proxy
+
 ```shell
 litellm --config config.yaml --debug
 ```

 Test Request
+
 ```
 curl --location 'http://0.0.0.0:4000/chat/completions' \
    --header 'Content-Type: application/json' \
@ -898,3 +1038,86 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
    ]
    }'
 ```
+
+## (BETA) Moderation with Azure Content Safety
+
+[Azure Content-Safety](https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety) is a Microsoft Azure service that provides content moderation APIs to detect potential offensive, harmful, or risky content in text.
+
+We will use the `--config` to set `litellm.callbacks = ["azure_content_safety"]`. This will moderate all LLM calls using Azure Content Safety.
+
+**Step 0** Deploy Azure Content Safety
+
+Deploy an Azure Content-Safety instance from the Azure Portal and get the `endpoint` and `key`.
+
+**Step 1** Set Azure Content Safety key
+
+```shell
+export AZURE_CONTENT_SAFETY_KEY="<your-azure-content-safety-key>"
+```
+
+**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `callbacks`
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+litellm_settings:
+  callbacks: ["azure_content_safety"]
+  azure_content_safety_params:
+    endpoint: "<your-azure-content-safety-endpoint>"
+    key: "os.environ/AZURE_CONTENT_SAFETY_KEY"
+```
+
+**Step 3**: Start the proxy, make a test request
+
+Start proxy
+```shell
+litellm --config config.yaml --debug
+```
+
+Test Request
+```
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+    --header 'Content-Type: application/json' \
+    --data ' {
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hi, how are you?"
+            }
+        ]
+    }'
+```
+
+An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`.
+The details of the response will describe :
+- The `source` : input text or llm generated text
+- The `category` : the category of the content that triggered the moderation
+- The `severity` : the severity from 0 to 10
+
+**Step 4**: Customizing Azure Content Safety Thresholds
+
+You can customize the thresholds for each category by setting the `thresholds` in the `config.yaml`
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+litellm_settings:
+  callbacks: ["azure_content_safety"]
+  azure_content_safety_params:
+    endpoint: "<your-azure-content-safety-endpoint>"
+    key: "os.environ/AZURE_CONTENT_SAFETY_KEY"
+    thresholds:
+      Hate: 6
+      SelfHarm: 8
+      Sexual: 6
+      Violence: 4
+```
+
+:::info
+`thresholds` are not required by default, but you can tune the values to your needs.
+The default value is `4` for all categories.
+:::
--- a/docs/my-website/docs/proxy/prod.md
+++ b/docs/my-website/docs/proxy/prod.md
@ -3,7 +3,75 @@ import TabItem from '@theme/TabItem';

 # ⚡ Best Practices for Production

-Expected Performance in Production
+## 1. Use this config.yaml
+Use this config.yaml in production (with your own LLMs)
+
+```yaml
+model_list:
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+general_settings:
+  master_key: sk-1234      # enter your own master key, ensure it starts with 'sk-'
+  alerting: ["slack"]      # Setup slack alerting - get alerts on LLM exceptions, Budget Alerts, Slow LLM Responses
+  proxy_batch_write_at: 60 # Batch write spend updates every 60s
+
+litellm_settings:
+  set_verbose: False      # Switch off Debug Logging, ensure your logs do not have any debugging on
+```
+
+Set slack webhook url in your env
+```shell
+export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<your-workspace-id>/<your-channel-id>/<your-webhook-token>"
+```
+
+:::info
+
+Need help or want dedicated support? Talk to a founder [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+:::
+
+
+## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
+
+Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
+
+(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD). 
+```shell
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
+```
+
+
+## 3. Use Redis 'port','host', 'password'. NOT 'redis_url'
+
+If you decide to use Redis, DO NOT use 'redis_url'. We recommend using the redis port, host, and password params.
+
+`redis_url` is 80 RPS slower
+
+This is still something we're investigating. Keep track of it [here](https://github.com/BerriAI/litellm/issues/3188)
+
+Recommended to do this for prod: 
+
+```yaml
+router_settings:
+  routing_strategy: usage-based-routing-v2 
+  # redis_url: "os.environ/REDIS_URL"
+  redis_host: os.environ/REDIS_HOST
+  redis_port: os.environ/REDIS_PORT
+  redis_password: os.environ/REDIS_PASSWORD
+```
+
+## 4. Disable 'load_dotenv'
+
+Set `export LITELLM_MODE="PRODUCTION"`
+
+This disables the `load_dotenv()` functionality, which would otherwise automatically load your environment credentials from the local `.env`.
+
+## Extras
+### Expected Performance in Production

 1 LiteLLM Uvicorn Worker on Kubernetes

@ -16,13 +84,7 @@ Expected Performance in Production
 | `/chat/completions` Requests/hour | `126K` |


-## 1. Switch of Debug Logging
-
-Remove `set_verbose: True` from your config.yaml
-```yaml
-litellm_settings:
-  set_verbose: True
-```
+### Verifying Debugging logs are off

 You should only see the following level of details in logs on the proxy server
 ```shell
@ -31,135 +93,8 @@ You should only see the following level of details in logs on the proxy server
 # INFO:     192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
 ```

-## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]

-Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
-
-(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD). 
-```shell
-CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
-```
-
-## 2. Batch write spend updates every 60s
-
-The default proxy batch write is 10s. This is to make it easy to see spend when debugging locally. 
-
-In production, we recommend using a longer interval period of 60s. This reduces the number of connections used to make DB writes. 
-
-```yaml
-general_settings:
-  master_key: sk-1234
-  proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
-```
-
-
-## 3. Move spend logs to separate server
-
-Writing each spend log to the db can slow down your proxy. In testing we saw a 70% improvement in median response time, by moving writing spend logs to a separate server. 
-
-👉 [LiteLLM Spend Logs Server](https://github.com/BerriAI/litellm/tree/main/litellm-js/spend-logs)
-
-
-**Spend Logs**  
-This is a log of the key, tokens, model, and latency for each call on the proxy. 
-
-[**Full Payload**](https://github.com/BerriAI/litellm/blob/8c9623a6bc4ad9da0a2dac64249a60ed8da719e8/litellm/proxy/utils.py#L1769)
-
-
-**1. Start the spend logs server**
-
-```bash
-docker run -p 3000:3000 \
-  -e DATABASE_URL="postgres://.." \
-  ghcr.io/berriai/litellm-spend_logs:main-latest
-
-# RUNNING on http://0.0.0.0:3000
-```
-
-**2. Connect to proxy**
-
-
-Example litellm_config.yaml
-
-```yaml
-model_list:
- model_name: fake-openai-endpoint
-  litellm_params:
-    model: openai/my-fake-model
-    api_key: my-fake-key
-    api_base: https://exampleopenaiendpoint-production.up.railway.app/
-
-general_settings:
-  master_key: sk-1234
-  proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
-```
-
-Add `SPEND_LOGS_URL` as an environment variable when starting the proxy 
-
-```bash
-docker run \
-    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
-    -e DATABASE_URL="postgresql://.." \
-    -e SPEND_LOGS_URL="http://host.docker.internal:3000" \ # 👈 KEY CHANGE
-    -p 4000:4000 \
-    ghcr.io/berriai/litellm:main-latest \
-    --config /app/config.yaml --detailed_debug
-
-# Running on http://0.0.0.0:4000
-```
-
-**3. Test Proxy!**
-
-
-```bash
-curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
-    "model": "fake-openai-endpoint", 
-    "messages": [
-        {"role": "system", "content": "Be helpful"},
-        {"role": "user", "content": "What do you know?"}
-    ]
-}'
-```
-
-In your LiteLLM Spend Logs Server, you should see
-
-**Expected Response**
-
-```
-Received and stored 1 logs. Total logs in memory: 1
-...
-Flushed 1 log to the DB.
-```
-
-
-### Machine Specification
-
-A t2.micro should be sufficient to handle 1k logs / minute on this server. 
-
-This consumes at max 120MB, and <0.1 vCPU. 
-
-## 4. Switch off resetting budgets
-
-Add this to your config.yaml. (Only spend per Key, User and Team will be tracked - spend per API Call will not be written to the LiteLLM Database)
-```yaml
-general_settings:
-  disable_spend_logs: true
-  disable_reset_budget: true
-```
-
-## 5. Switch of `litellm.telemetry`
-
-Switch of all telemetry tracking done by litellm
-
-```yaml
-litellm_settings:
-  telemetry: False
-```
-
-## Machine Specifications to Deploy LiteLLM
+### Machine Specifications to Deploy LiteLLM

 | Service | Spec | CPUs | Memory | Architecture | Version|
 | --- | --- | --- | --- | --- | --- | 
@ -167,7 +102,7 @@ litellm_settings:
 | Redis Cache | - | - | - | - | 7.0+ Redis Engine|


-## Reference Kubernetes Deployment YAML
+### Reference Kubernetes Deployment YAML

 Reference Kubernetes `deployment.yaml` that was load tested by us

--- a/docs/my-website/docs/proxy/grafana_metrics.md
+++ b/docs/my-website/docs/proxy/grafana_metrics.md
@ -14,6 +14,7 @@ model_list:
      model: gpt-3.5-turbo
 litellm_settings:
  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]
 ```

 Start the proxy
@ -48,6 +49,26 @@ http://localhost:4000/metrics

 | Metric Name          | Description                          |
 |----------------------|--------------------------------------|
-| `litellm_requests_metric`             | Number of requests made, per `"user", "key", "model"`          |
-| `litellm_spend_metric`                | Total Spend, per `"user", "key", "model"`                 |
-| `litellm_total_tokens`         | input + output tokens per `"user", "key", "model"`     |
+| `litellm_requests_metric`             | Number of requests made, per `"user", "key", "model", "team", "end-user"`          |
+| `litellm_spend_metric`                | Total Spend, per `"user", "key", "model", "team", "end-user"`                 |
+| `litellm_total_tokens`         | input + output tokens per `"user", "key", "model", "team", "end-user"`     |
+| `litellm_llm_api_failed_requests_metric`   | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"`    |
+
+## Monitor System Health
+
+To monitor the health of litellm adjacent services (redis / postgres), do:
+
+```yaml
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+litellm_settings:
+  service_callback: ["prometheus_system"]
+```
+
+| Metric Name          | Description                          |
+|----------------------|--------------------------------------|
+| `litellm_redis_latency`         | histogram latency for redis calls     |
+| `litellm_redis_fails`         | Number of failed redis calls    |
+| `litellm_self_latency`         | Histogram latency for successful litellm api call    |
--- a/docs/my-website/docs/proxy/quick_start.md
+++ b/docs/my-website/docs/proxy/quick_start.md
@ -348,6 +348,29 @@ query_result = embeddings.embed_query(text)

 print(f"TITAN EMBEDDINGS")
 print(query_result[:5])
+```
+</TabItem>
+<TabItem value="litellm" label="LiteLLM SDK">
+
+This is **not recommended**. There is duplicate logic as the proxy also uses the sdk, which might lead to unexpected errors. 
+
+```python
+from litellm import completion 
+
+response = completion(
+    model="openai/gpt-3.5-turbo", 
+    messages = [
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ], 
+    api_key="anything", 
+    base_url="http://0.0.0.0:4000"
+    )
+
+print(response)
+
 ```
 </TabItem>
 </Tabs>
--- a/docs/my-website/docs/proxy/reliability.md
+++ b/docs/my-website/docs/proxy/reliability.md
@ -136,7 +136,22 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 '
 ```

-## Advanced - Context Window Fallbacks 
+### Test it!
+
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+     --header 'Content-Type: application/json' \
+     --data-raw '{
+        "model": "zephyr-beta", # 👈 MODEL NAME to fallback from
+        "messages": [
+            {"role": "user", "content": "what color is red"}
+        ],
+        "mock_testing_fallbacks": true
+     }'
+```
+
+## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)

 **Before call is made** check if a call is within model context window with  **`enable_pre_call_checks: true`**.

@ -217,16 +232,16 @@ model_list:
 	- model_name: gpt-3.5-turbo-small
 	  litellm_params:
 		model: azure/chatgpt-v-2
-		api_base: os.environ/AZURE_API_BASE
-		api_key: os.environ/AZURE_API_KEY
-		api_version: "2023-07-01-preview"
-	  model_info:
-		base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
+      api_base: os.environ/AZURE_API_BASE
+      api_key: os.environ/AZURE_API_KEY
+      api_version: "2023-07-01-preview"
+      model_info:
+      base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
 	
 	- model_name: gpt-3.5-turbo-large
 	  litellm_params:
-		model: gpt-3.5-turbo-1106
-		api_key: os.environ/OPENAI_API_KEY
+      model: gpt-3.5-turbo-1106
+      api_key: os.environ/OPENAI_API_KEY

  - model_name: claude-opus
    litellm_params:
@ -272,6 +287,69 @@ print(response)
 </Tabs>


+## Advanced - EU-Region Filtering (Pre-Call Checks)
+
+**Before call is made** check if a deployment is in an allowed region (e.g. the EU) with **`enable_pre_call_checks: true`**.
+
+Set 'region_name' of deployment. 
+
+**Note:** LiteLLM can automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set `litellm.enable_preview = True`.
+
+**1. Set Config**
+
+```yaml
+router_settings:
+  enable_pre_call_checks: true # 1. Enable pre-call checks
+
+model_list:
+- model_name: gpt-3.5-turbo
+  litellm_params:
+    model: azure/chatgpt-v-2
+    api_base: os.environ/AZURE_API_BASE
+    api_key: os.environ/AZURE_API_KEY
+    api_version: "2023-07-01-preview"
+    region_name: "eu" # 👈 SET EU-REGION
+
+- model_name: gpt-3.5-turbo
+  litellm_params:
+    model: gpt-3.5-turbo-1106
+    api_key: os.environ/OPENAI_API_KEY
+
+- model_name: gemini-pro
+  litellm_params:
+    model: vertex_ai/gemini-pro-1.5
+    vertex_project: adroit-crow-1234
+    vertex_location: us-east1 # 👈 AUTOMATICALLY INFERS 'region_name'
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+**3. Test it!**
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.with_raw_response.create(
+    model="gpt-3.5-turbo",
+    messages = [{"role": "user", "content": "Who was Alexander?"}]
+)
+
+print(response)
+
+print(f"{response.headers.get('x-litellm-model-api-base')}")
+```
+
 ## Advanced - Custom Timeouts, Stream Timeouts - Per Model
 For each model you can set `timeout` & `stream_timeout` under `litellm_params`
 ```yaml
--- a/docs/my-website/docs/proxy/token_auth.md
+++ b/docs/my-website/docs/proxy/token_auth.md
@ -17,6 +17,7 @@ This is a new feature, and subject to changes based on feedback.
 ### Step 1. Setup Proxy

 - `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
+- `JWT_AUDIENCE`: This is the audience used for decoding the JWT. If not set, the decode step will not verify the audience. 

 ```bash
 export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
@ -109,7 +110,7 @@ general_settings:
    admin_jwt_scope: "litellm-proxy-admin"
 ```

-## Advanced - Spend Tracking (User / Team / Org)
+## Advanced - Spend Tracking (End-Users / Internal Users / Team / Org)

 Set the field in the jwt token, which corresponds to a litellm user / team / org.

@ -122,6 +123,7 @@ general_settings:
    team_id_jwt_field: "client_id" # 👈 CAN BE ANY FIELD
    user_id_jwt_field: "sub" # 👈 CAN BE ANY FIELD
    org_id_jwt_field: "org_id" # 👈 CAN BE ANY FIELD
+    end_user_id_jwt_field: "customer_id" # 👈 CAN BE ANY FIELD
 ```

 Expected JWT: 
@ -130,7 +132,7 @@ Expected JWT:
 {
  "client_id": "my-unique-team",
  "sub": "my-unique-user",
-  "org_id": "my-unique-org"
+  "org_id": "my-unique-org",
+  "customer_id": "my-unique-customer"
 }
 ```

--- a/docs/my-website/docs/proxy/user_keys.md
+++ b/docs/my-website/docs/proxy/user_keys.md
@ -121,6 +121,9 @@ from langchain.prompts.chat import (
    SystemMessagePromptTemplate,
 )
 from langchain.schema import HumanMessage, SystemMessage
+import os 
+
+os.environ["OPENAI_API_KEY"] = "anything"

 chat = ChatOpenAI(
    openai_api_base="http://0.0.0.0:4000",
@ -362,6 +365,188 @@ curl --location 'http://0.0.0.0:4000/moderations' \

 ## Advanced

+### (BETA) Batch Completions - pass multiple models
+
+Use this when you want to send 1 request to N Models
+
+#### Expected Request Format
+
+Pass `model` as a comma-separated string of model names. Example: `"model": "llama3,gpt-3.5-turbo"`
+
+This same request will be sent to the following model groups on the [litellm proxy config.yaml](https://docs.litellm.ai/docs/proxy/configs)
+- `model_name="llama3"`
+- `model_name="gpt-3.5-turbo"` 
+
+<Tabs>
+
+<TabItem value="openai-py" label="OpenAI Python SDK">
+
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo,llama3",
+    messages=[
+        {"role": "user", "content": "this is a test request, write a short poem"}
+    ],
+)
+
+print(response)
+```
+
+
+
+#### Expected Response Format
+
+Get a list of responses when `model` is passed as a comma-separated string of models
+
+```python
+[
+    ChatCompletion(
+        id='chatcmpl-9NoYhS2G0fswot0b6QpoQgmRQMaIf',
+        choices=[
+            Choice(
+                finish_reason='stop',
+                index=0,
+                logprobs=None,
+                message=ChatCompletionMessage(
+                    content='In the depths of my soul, a spark ignites\nA light that shines so pure and bright\nIt dances and leaps, refusing to die\nA flame of hope that reaches the sky\n\nIt warms my heart and fills me with bliss\nA reminder that in darkness, there is light to kiss\nSo I hold onto this fire, this guiding light\nAnd let it lead me through the darkest night.',
+                    role='assistant',
+                    function_call=None,
+                    tool_calls=None
+                )
+            )
+        ],
+        created=1715462919,
+        model='gpt-3.5-turbo-0125',
+        object='chat.completion',
+        system_fingerprint=None,
+        usage=CompletionUsage(
+            completion_tokens=83,
+            prompt_tokens=17,
+            total_tokens=100
+        )
+    ),
+    ChatCompletion(
+        id='chatcmpl-4ac3e982-da4e-486d-bddb-ed1d5cb9c03c',
+        choices=[
+            Choice(
+                finish_reason='stop',
+                index=0,
+                logprobs=None,
+                message=ChatCompletionMessage(
+                    content="A test request, and I'm delighted!\nHere's a short poem, just for you:\n\nMoonbeams dance upon the sea,\nA path of light, for you to see.\nThe stars up high, a twinkling show,\nA night of wonder, for all to know.\n\nThe world is quiet, save the night,\nA peaceful hush, a gentle light.\nThe world is full, of beauty rare,\nA treasure trove, beyond compare.\n\nI hope you enjoyed this little test,\nA poem born, of whimsy and jest.\nLet me know, if there's anything else!",
+                    role='assistant',
+                    function_call=None,
+                    tool_calls=None
+                )
+            )
+        ],
+        created=1715462919,
+        model='groq/llama3-8b-8192',
+        object='chat.completion',
+        system_fingerprint='fp_a2c8d063cb',
+        usage=CompletionUsage(
+            completion_tokens=120,
+            prompt_tokens=20,
+            total_tokens=140
+        )
+    )
+]
+```
+
+
+</TabItem>
+
+<TabItem value="curl" label="Curl">
+
+
+
+
+```shell
+curl --location 'http://localhost:4000/chat/completions' \
+    --header 'Authorization: Bearer sk-1234' \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "model": "llama3,gpt-3.5-turbo",
+    "max_tokens": 10,
+    "user": "litellm2",
+    "messages": [
+        {
+        "role": "user",
+        "content": "is litellm getting better"
+        }
+    ]
+}'
+```
+
+
+
+
+#### Expected Response Format
+
+Get a list of responses when `model` is passed as a comma-separated string of models
+
+```json
+[
+  {
+    "id": "chatcmpl-3dbd5dd8-7c82-4ca3-bf1f-7c26f497cf2b",
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "message": {
+          "content": "The Elder Scrolls IV: Oblivion!\n\nReleased",
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1715459876,
+    "model": "groq/llama3-8b-8192",
+    "object": "chat.completion",
+    "system_fingerprint": "fp_179b0f92c9",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 12,
+      "total_tokens": 22
+    }
+  },
+  {
+    "id": "chatcmpl-9NnldUfFLmVquFHSX4yAtjCw8PGei",
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "message": {
+          "content": "TES4 could refer to The Elder Scrolls IV:",
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1715459877,
+    "model": "gpt-3.5-turbo-0125",
+    "object": "chat.completion",
+    "system_fingerprint": null,
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 9,
+      "total_tokens": 19
+    }
+  }
+]
+```
+
+
+</TabItem>
+</Tabs>
+
+
+
+
+
 ### Pass User LLM API Keys, Fallbacks
 Allow your end-users to pass their model list, api base, OpenAI API key (any LiteLLM supported provider) to make requests 

--- a/docs/my-website/docs/proxy/users.md
+++ b/docs/my-website/docs/proxy/users.md
@ -12,8 +12,8 @@ Requirements:

 You can set budgets at 3 levels: 
 - For the proxy 
- For a user 
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
+- For an internal user 
+- For an end-user
 - For a key
 - For a key (model specific budgets)

@ -58,7 +58,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 }'
 ```
 </TabItem>
-<TabItem value="per-user" label="For User">
+<TabItem value="per-user" label="For Internal User">

 Apply a budget across multiple keys.

@ -165,12 +165,12 @@ curl --location 'http://localhost:4000/team/new' \
 }
 ```
 </TabItem>
-<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
+<TabItem value="per-user-chat" label="For End User">

 Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**

 **Step 1. Modify config.yaml**
-Define `litellm.max_user_budget`
+Define `litellm.max_end_user_budget`
 ```yaml
 general_settings:
  master_key: sk-1234
@ -328,7 +328,7 @@ You can set:
 - max parallel requests

 <Tabs>
-<TabItem value="per-user" label="Per User">
+<TabItem value="per-user" label="Per Internal User">

 Use `/user/new`, to persist rate limits across multiple keys.

@ -408,7 +408,7 @@ curl --location 'http://localhost:4000/user/new' \
 ```


-## Create new keys for existing user
+## Create new keys for existing internal user

 Just include user_id in the `/key/generate` request.

--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -95,8 +95,8 @@ print(response)
 - `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
 - `router.aimage_generation()` - async image generation calls

-### Advanced - Routing Strategies
-#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
+## Advanced - Routing Strategies
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based

 Router provides 4 strategies for routing your calls across multiple deployments: 

@ -278,8 +278,38 @@ router_settings:
 	routing_strategy_args: {"ttl": 10}
 ```

+### Set Lowest Latency Buffer
+
+Set a buffer within which deployments are candidates for making calls to. 
+
+E.g. 
+
+if you have 5 deployments
+
+```
+https://litellm-prod-1.openai.azure.com/: 0.07s
+https://litellm-prod-2.openai.azure.com/: 0.1s
+https://litellm-prod-3.openai.azure.com/: 0.1s
+https://litellm-prod-4.openai.azure.com/: 0.1s
+https://litellm-prod-5.openai.azure.com/: 4.66s
+```
+
+To avoid initially overloading `prod-1` with all requests, we can set a buffer of 50% so that deployments `prod-2`, `prod-3`, and `prod-4` are also considered.
+
+**In Router**
+```python 
+router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
+```
+
+**In Proxy**
+
+```yaml
+router_settings:
+  routing_strategy_args: {"lowest_latency_buffer": 0.5}
+```
+
 </TabItem>
-<TabItem value="simple-shuffle" label="(Default) Weighted Pick">
+<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">

 **Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**

@ -437,12 +467,136 @@ async def router_acompletion():
 asyncio.run(router_acompletion())
 ```

+</TabItem>
+<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
+
+Picks a deployment based on the lowest cost
+
+How this works:
+- Get all healthy deployments
+- Select all deployments that are under their provided `rpm/tpm` limits
+- For each deployment check if `litellm_param["model"]` exists in [`litellm_model_cost_map`](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) 
+	- if deployment does not exist in `litellm_model_cost_map` -> use deployment_cost= `$1`
+- Select deployment with lowest cost
+
+```python
+from litellm import Router 
+import asyncio
+
+model_list =  [
+	{
+		"model_name": "gpt-3.5-turbo",
+		"litellm_params": {"model": "gpt-4"},
+		"model_info": {"id": "openai-gpt-4"},
+	},
+	{
+		"model_name": "gpt-3.5-turbo",
+		"litellm_params": {"model": "groq/llama3-8b-8192"},
+		"model_info": {"id": "groq-llama"},
+	},
+]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="cost-based-routing")
+async def router_acompletion():
+	response = await router.acompletion(
+		model="gpt-3.5-turbo", 
+		messages=[{"role": "user", "content": "Hey, how's it going?"}]
+	)
+	print(response)
+
+	print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost
+	return response
+
+asyncio.run(router_acompletion())
+
+```
+
+
+#### Using Custom Input/Output pricing
+
+Set `litellm_params["input_cost_per_token"]` and `litellm_params["output_cost_per_token"]` for using custom pricing when routing
+
+```python
+model_list = [
+	{
+		"model_name": "gpt-3.5-turbo",
+		"litellm_params": {
+			"model": "azure/chatgpt-v-2",
+			"input_cost_per_token": 0.00003,
+			"output_cost_per_token": 0.00003,
+		},
+		"model_info": {"id": "chatgpt-v-experimental"},
+	},
+	{
+		"model_name": "gpt-3.5-turbo",
+		"litellm_params": {
+			"model": "azure/chatgpt-v-1",
+			"input_cost_per_token": 0.000000001,
+			"output_cost_per_token": 0.00000001,
+		},
+		"model_info": {"id": "chatgpt-v-1"},
+	},
+	{
+		"model_name": "gpt-3.5-turbo",
+		"litellm_params": {
+			"model": "azure/chatgpt-v-5",
+			"input_cost_per_token": 10,
+			"output_cost_per_token": 12,
+		},
+		"model_info": {"id": "chatgpt-v-5"},
+	},
+]
+# init router
+router = Router(model_list=model_list, routing_strategy="cost-based-routing")
+async def router_acompletion():
+	response = await router.acompletion(
+		model="gpt-3.5-turbo", 
+		messages=[{"role": "user", "content": "Hey, how's it going?"}]
+	)
+	print(response)
+
+	print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost
+	return response
+
+asyncio.run(router_acompletion())
+```
+
 </TabItem>

 </Tabs>

 ## Basic Reliability

+### Max Parallel Requests (ASYNC)
+
+Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios. 
+
+If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit. 
+
+
+```python
+from litellm import Router 
+
+model_list = [{
+	"model_name": "gpt-4",
+	"litellm_params": {
+		"model": "azure/gpt-4",
+		...
+		"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
+	}
+}]
+
+### OR ### 
+
+router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS 
+
+
+# deployment max parallel requests > default max parallel requests
+```
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
+
 ### Timeouts 

 The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. 
@ -499,7 +653,9 @@ from litellm import Router
 model_list = [{...}]

 router = Router(model_list=model_list, 
-                allowed_fails=1) # cooldown model if it fails > 1 call in a minute. 
+                allowed_fails=1,      # cooldown model if it fails > 1 call in a minute. 
+				cooldown_time=100    # cooldown the deployment for 100 seconds if num_fails > allowed_fails
+		)

 user_message = "Hello, whats the weather in San Francisco??"
 messages = [{"content": user_message, "role": "user"}]
@ -557,6 +713,57 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
 print(f"response: {response}")
 ```

+#### Retries based on Error Type
+
+Use `RetryPolicy` if you want to set `num_retries` based on the Exception received
+
+Example:
+- 3 retries for `ContentPolicyViolationError`
+- 0 retries for `AuthenticationError`
+
+Example Usage
+
+```python
+from litellm.router import RetryPolicy
+retry_policy = RetryPolicy(
+	ContentPolicyViolationErrorRetries=3, # run 3 retries for ContentPolicyViolationErrors
+	AuthenticationErrorRetries=0,		  # run 0 retries for AuthenticationErrors
+	BadRequestErrorRetries=1,
+	TimeoutErrorRetries=2,
+	RateLimitErrorRetries=3,
+)
+
+router = litellm.Router(
+	model_list=[
+		{
+			"model_name": "gpt-3.5-turbo",  # openai model name
+			"litellm_params": {  # params for litellm completion/embedding call
+				"model": "azure/chatgpt-v-2",
+				"api_key": os.getenv("AZURE_API_KEY"),
+				"api_version": os.getenv("AZURE_API_VERSION"),
+				"api_base": os.getenv("AZURE_API_BASE"),
+			},
+		},
+		{
+			"model_name": "bad-model",  # openai model name
+			"litellm_params": {  # params for litellm completion/embedding call
+				"model": "azure/chatgpt-v-2",
+				"api_key": "bad-key",
+				"api_version": os.getenv("AZURE_API_VERSION"),
+				"api_base": os.getenv("AZURE_API_BASE"),
+			},
+		},
+	],
+	retry_policy=retry_policy,
+)
+
+response = await router.acompletion(
+	model=model,
+	messages=messages,
+)
+```
+
+
 ### Fallbacks 

 If a call fails after num_retries, fall back to another model group. 
@ -565,6 +772,8 @@ If the error is a context window exceeded error, fall back to a larger model gro

 Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.

+You can also set 'default_fallbacks', in case a specific model group is misconfigured / bad.
+
 ```python
 from litellm import Router

@ -625,6 +834,7 @@ model_list = [

 router = Router(model_list=model_list, 
                fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], 
+				default_fallbacks=["gpt-3.5-turbo-16k"],
                context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
                set_verbose=True)

@ -674,13 +884,11 @@ router = Router(model_list: Optional[list] = None,
 				 cache_responses=True)
 ```

-## Pre-Call Checks (Context Window)
+## Pre-Call Checks (Context Window, EU-Regions)

 Enable pre-call checks to filter out:
 1. deployments with context window limit < messages for a call.
-2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[
-        router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
-    ])`)
+2. deployments outside of eu-region

 <Tabs>
 <TabItem value="sdk" label="SDK">
@ -695,10 +903,14 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t

 **2. Set Model List**

-For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. 
+For context window checks on azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. 

-<Tabs>
-<TabItem value="same-group" label="Same Group">
+For 'eu-region' filtering, set the `region_name` of the deployment.
+
+**Note:** We automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set `litellm.enable_preview = True`.
+
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/d33e49411d6503cb634f9652873160cd534dec96/litellm/router.py#L2958)

 ```python
 model_list = [
@ -709,10 +921,9 @@ model_list = [
                    "api_key": os.getenv("AZURE_API_KEY"),
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "api_base": os.getenv("AZURE_API_BASE"),
-                },
-				"model_info": {
+					"region_name": "eu", # 👈 SET 'EU' REGION NAME
 					"base_model": "azure/gpt-35-turbo", # 👈 (Azure-only) SET BASE MODEL
-				}
+                },
            },
            {
                "model_name": "gpt-3.5-turbo", # model group name
@ -721,54 +932,26 @@ model_list = [
                    "api_key": os.getenv("OPENAI_API_KEY"),
                },
            },
+			{
+				"model_name": "gemini-pro",
+				"litellm_params": {
+					"model": "vertex_ai/gemini-pro-1.5", 
+					"vertex_project": "adroit-crow-1234",
+					"vertex_location": "us-east1" # 👈 AUTOMATICALLY INFERS 'region_name'
+				}
+			}
        ]

 router = Router(model_list=model_list, enable_pre_call_checks=True) 
 ```

-</TabItem>
-
-<TabItem value="different-group" label="Context Window Fallbacks (Different Groups)">
-
-```python
-model_list = [
-            {
-                "model_name": "gpt-3.5-turbo-small", # model group name
-                "litellm_params": {  # params for litellm completion/embedding call
-                    "model": "azure/chatgpt-v-2",
-                    "api_key": os.getenv("AZURE_API_KEY"),
-                    "api_version": os.getenv("AZURE_API_VERSION"),
-                    "api_base": os.getenv("AZURE_API_BASE"),
-                },
-				"model_info": {
-					"base_model": "azure/gpt-35-turbo", # 👈 (Azure-only) SET BASE MODEL
-				}
-            },
-            {
-                "model_name": "gpt-3.5-turbo-large", # model group name
-                "litellm_params": {  # params for litellm completion/embedding call
-                    "model": "gpt-3.5-turbo-1106",
-                    "api_key": os.getenv("OPENAI_API_KEY"),
-                },
-            },
-            {
-                "model_name": "claude-opus", 
-                "litellm_params": {  call
-                    "model": "claude-3-opus-20240229",
-                    "api_key": os.getenv("ANTHROPIC_API_KEY"),
-                },
-            },
-        ]
-
-router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) 
-```
-
-</TabItem>
-
-</Tabs>

 **3. Test it!**

+
+<Tabs>
+<TabItem value="context-window-check" label="Context Window Check">
+
 ```python
 """
 - Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
@ -778,7 +961,6 @@ router = Router(model_list=model_list, enable_pre_call_checks=True, context_wind
 from litellm import Router
 import os

-try:
 model_list = [
 	{
 		"model_name": "gpt-3.5-turbo",  # model group name
@ -787,6 +969,7 @@ model_list = [
 			"api_key": os.getenv("AZURE_API_KEY"),
 			"api_version": os.getenv("AZURE_API_VERSION"),
 			"api_base": os.getenv("AZURE_API_BASE"),
+			"base_model": "azure/gpt-35-turbo",
 		},
 		"model_info": {
 			"base_model": "azure/gpt-35-turbo", 
@ -816,6 +999,59 @@ response = router.completion(
 print(f"response: {response}")
 ```
 </TabItem>
+<TabItem value="eu-region-check" label="EU Region Check">
+
+```python
+"""
+- Give 2 gpt-3.5-turbo deployments, in eu + non-eu regions
+- Make a call
+- Assert it picks the eu-region model
+"""
+
+from litellm import Router
+import os
+
+model_list = [
+	{
+		"model_name": "gpt-3.5-turbo",  # model group name
+		"litellm_params": {  # params for litellm completion/embedding call
+			"model": "azure/chatgpt-v-2",
+			"api_key": os.getenv("AZURE_API_KEY"),
+			"api_version": os.getenv("AZURE_API_VERSION"),
+			"api_base": os.getenv("AZURE_API_BASE"),
+			"region_name": "eu"
+		},
+		"model_info": {
+			"id": "1"
+		}
+	},
+	{
+		"model_name": "gpt-3.5-turbo",  # model group name
+		"litellm_params": {  # params for litellm completion/embedding call
+			"model": "gpt-3.5-turbo-1106",
+			"api_key": os.getenv("OPENAI_API_KEY"),
+		},
+		"model_info": {
+			"id": "2"
+		}
+	},
+]
+
+router = Router(model_list=model_list, enable_pre_call_checks=True) 
+
+response = router.completion(
+	model="gpt-3.5-turbo",
+	messages=[{"role": "user", "content": "Who was Alexander?"}],
+)
+
+print(f"response: {response}")
+
+print(f"response id: {response._hidden_params['model_id']}")
+```
+
+</TabItem>
+</Tabs>
+</TabItem>
 <TabItem value="proxy" label="Proxy">

 :::info
@ -881,6 +1117,46 @@ async def test_acompletion_caching_on_router_caching_groups():
 asyncio.run(test_acompletion_caching_on_router_caching_groups())
 ```

+## Alerting 🚨
+
+Send alerts to Slack or your own webhook URL for the following events:
+- LLM API Exceptions
+- Slow LLM Responses
+
+Get a slack webhook url from https://api.slack.com/messaging/webhooks
+
+#### Usage
+Initialize an `AlertingConfig` and pass it to `litellm.Router`. The following code will trigger an alert because `api_key="bad_key"` is invalid.
+
+```python
+from litellm.router import AlertingConfig
+import litellm
+import os
+
+router = litellm.Router(
+	model_list=[
+		{
+			"model_name": "gpt-3.5-turbo",
+			"litellm_params": {
+				"model": "gpt-3.5-turbo",
+				"api_key": "bad_key",
+			},
+		}
+	],
+	alerting_config= AlertingConfig(
+		alerting_threshold=10,                        # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
+		webhook_url= os.getenv("SLACK_WEBHOOK_URL")   # webhook you want to send alerts to
+	),
+)
+try:
+	await router.acompletion(
+		model="gpt-3.5-turbo",
+		messages=[{"role": "user", "content": "Hey, how's it going?"}],
+	)
+except:
+	pass
+```
+
 ## Track cost for Azure Deployments

 **Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
@ -1038,10 +1314,11 @@ def __init__(
 	num_retries: int = 0,
 	timeout: Optional[float] = None,
 	default_litellm_params={},  # default params for Router.chat.completion.create
-	fallbacks: List = [],
+	fallbacks: Optional[List] = None,
+	default_fallbacks: Optional[List] = None,
 	allowed_fails: Optional[int] = None, # Number of times a deployment can failbefore being added to cooldown
 	cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
-	context_window_fallbacks: List = [],
+	context_window_fallbacks: Optional[List] = None,
 	model_group_alias: Optional[dict] = {},
 	retry_after: int = 0,  # (min) time to wait before retrying a failed request
 	routing_strategy: Literal[
@ -1049,6 +1326,7 @@ def __init__(
 		"least-busy",
 		"usage-based-routing",
 		"latency-based-routing",
+		"cost-based-routing",
 	] = "simple-shuffle",

 	## DEBUGGING ##
--- a/docs/my-website/docs/set_keys.md
+++ b/docs/my-website/docs/set_keys.md
@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
 * API Base
 * API Version
 * API Type
+* Project
+* Location
+* Token

 Useful Helper functions: 
 * [`check_valid_key()`](#check_valid_key)
@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
 os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
 ```

+### Setting Project, Location, Token
+
+For cloud providers:
+- Azure
+- Bedrock
+- GCP
+- Watson AI 
+
+you might need to set additional parameters. LiteLLM provides a common set of params that we map across all providers.
+
+|      | LiteLLM param | Watson       | Vertex AI    | Azure        | Bedrock      |
+|------|--------------|--------------|--------------|--------------|--------------|
+| Project | project | watsonx_project | vertex_project | n/a | n/a |
+| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
+| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |
+
+If you want, you can call them by their provider-specific params as well. 
+
 ## litellm variables

 ### litellm.api_key
--- a/docs/my-website/docusaurus.config.js
+++ b/docs/my-website/docusaurus.config.js
@ -105,6 +105,12 @@ const config = {
            label: 'Enterprise',
            to: "docs/enterprise"
          },
+          {
+            sidebarId: 'tutorialSidebar',
+            position: 'left',
+            label: '🚀 Hosted',
+            to: "docs/hosted"
+          },
          {
            href: 'https://github.com/BerriAI/litellm',
            label: 'GitHub',
--- a/docs/my-website/img/lago.jpeg
+++ b/docs/my-website/img/lago.jpeg
--- a/docs/my-website/img/lago_2.png
+++ b/docs/my-website/img/lago_2.png
--- a/docs/my-website/img/litellm_hosted_ui_add_models.png
+++ b/docs/my-website/img/litellm_hosted_ui_add_models.png
--- a/docs/my-website/img/litellm_hosted_ui_create_key.png
+++ b/docs/my-website/img/litellm_hosted_ui_create_key.png
--- a/docs/my-website/img/litellm_hosted_ui_router.png
+++ b/docs/my-website/img/litellm_hosted_ui_router.png
--- a/docs/my-website/img/litellm_hosted_usage_dashboard.png
+++ b/docs/my-website/img/litellm_hosted_usage_dashboard.png
--- a/docs/my-website/img/openmeter.png
+++ b/docs/my-website/img/openmeter.png
--- a/docs/my-website/img/openmeter_img_2.png
+++ b/docs/my-website/img/openmeter_img_2.png
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -39,13 +39,21 @@ const sidebars = {
        "proxy/demo",
        "proxy/configs",
        "proxy/reliability",
+        "proxy/cost_tracking",
        "proxy/users",
+        "proxy/billing",
        "proxy/user_keys",
        "proxy/enterprise",
        "proxy/virtual_keys",
+        "proxy/alerting",
+        {
+          type: "category",
+          label: "Logging",
+          items: ["proxy/logging", "proxy/streaming_logging"],
+        },
        "proxy/team_based_routing",
+        "proxy/customer_routing",
        "proxy/ui",
-        "proxy/cost_tracking",
        "proxy/token_auth",
        {
          type: "category",
@ -58,12 +66,7 @@ const sidebars = {
        "proxy/pii_masking",
        "proxy/prompt_injection",
        "proxy/caching",
-        {
-          type: "category",
-          label: "Logging, Alerting",
-          items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
-        },
-        "proxy/grafana_metrics",
+        "proxy/prometheus",
        "proxy/call_hooks",
        "proxy/rules",
        "proxy/cli", 
@ -86,6 +89,7 @@ const sidebars = {
        "completion/stream",
        "completion/message_trimming",
        "completion/function_call",
+        "completion/vision",
        "completion/model_alias",
        "completion/batching",
        "completion/mock_requests",
@ -115,6 +119,7 @@ const sidebars = {
      },
      items: [
        "providers/openai", 
+        "providers/text_completion_openai",
        "providers/openai_compatible",
        "providers/azure", 
        "providers/azure_ai", 
@ -128,9 +133,13 @@ const sidebars = {
        "providers/cohere", 
        "providers/anyscale",
        "providers/huggingface", 
+        "providers/watsonx",
+        "providers/predibase",
+        "providers/triton-inference-server",
        "providers/ollama", 
        "providers/perplexity", 
        "providers/groq", 
+        "providers/deepseek", 
        "providers/fireworks_ai", 
        "providers/vllm", 
        "providers/xinference", 
@ -146,6 +155,7 @@ const sidebars = {
        "providers/openrouter", 
        "providers/custom_openai_proxy",
        "providers/petals",
+        
      ],
    },
    "proxy/custom_pricing",
@ -166,19 +176,22 @@ const sidebars = {
        "observability/custom_callback",
        "observability/langfuse_integration",
        "observability/sentry",
+        "observability/lago",
+        "observability/openmeter",
        "observability/promptlayer_integration",
        "observability/wandb_integration",
        "observability/langsmith_integration",
        "observability/slack_integration",
        "observability/traceloop_integration",
-        "observability/lunary_integration",
        "observability/athina_integration",
+        "observability/lunary_integration",
+        "observability/greenscale_integration",
        "observability/helicone_integration",
        "observability/supabase_integration",
        `observability/telemetry`,
      ],
    },
-    "caching/redis_cache",
+    "caching/all_caches",
    {
      type: "category",
      label: "Tutorials",
--- a/docs/my-website/src/pages/token_usage.md
+++ b/docs/my-website/src/pages/token_usage.md
@ -16,7 +16,7 @@ However, we also expose 3 public helper functions to calculate token usage acros
 ```python
 from litellm import token_counter

-messages = [{"user": "role", "content": "Hey, how's it going"}]
+messages = [{"role": "user", "content": "Hey, how's it going"}]
 print(token_counter(model="gpt-3.5-turbo", messages=messages))
 ```

--- a/enterprise/enterprise_callbacks/generic_api_callback.py
+++ b/enterprise/enterprise_callbacks/generic_api_callback.py
@ -10,7 +10,6 @@ from litellm.caching import DualCache

 from typing import Literal, Union

-dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback


@ -19,8 +18,6 @@ import traceback

 import dotenv, os
 import requests
-
-dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
 import datetime, subprocess, sys
 import litellm, uuid
--- a/enterprise/utils.py
+++ b/enterprise/utils.py
@ -1,6 +1,7 @@
 # Enterprise Proxy Util Endpoints
 from litellm._logging import verbose_logger
 import collections
+from datetime import datetime


 async def get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
@ -18,26 +19,33 @@ async def get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
    return response


-async def ui_get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
-    response = await prisma_client.db.query_raw(
-        """
+async def ui_get_spend_by_tags(start_date: str, end_date: str, prisma_client):
+
+    sql_query = """
        SELECT
        jsonb_array_elements_text(request_tags) AS individual_request_tag,
        DATE(s."startTime") AS spend_date,
        COUNT(*) AS log_count,
        SUM(spend) AS total_spend
        FROM "LiteLLM_SpendLogs" s
-        WHERE s."startTime" >= current_date - interval '30 days'
+        WHERE
+            DATE(s."startTime") >= $1::date
+            AND DATE(s."startTime") <= $2::date
        GROUP BY individual_request_tag, spend_date
-        ORDER BY spend_date;
-        """
+        ORDER BY spend_date
+        LIMIT 100;
+    """
+    response = await prisma_client.db.query_raw(
+        sql_query,
+        start_date,
+        end_date,
    )

    # print("tags - spend")
    # print(response)
    # Bar Chart 1 - Spend per tag - Top 10 tags by spend
-    total_spend_per_tag = collections.defaultdict(float)
-    total_requests_per_tag = collections.defaultdict(int)
+    total_spend_per_tag: collections.defaultdict = collections.defaultdict(float)
+    total_requests_per_tag: collections.defaultdict = collections.defaultdict(int)
    for row in response:
        tag_name = row["individual_request_tag"]
        tag_spend = row["total_spend"]
@ -49,15 +57,18 @@ async def ui_get_spend_by_tags(start_date=None, end_date=None, prisma_client=Non
    # convert to ui format
    ui_tags = []
    for tag in sorted_tags:
+        current_spend = tag[1]
+        if current_spend is not None and isinstance(current_spend, float):
+            current_spend = round(current_spend, 4)
        ui_tags.append(
            {
                "name": tag[0],
-                "value": tag[1],
+                "spend": current_spend,
                "log_count": total_requests_per_tag[tag[0]],
            }
        )

-    return {"top_10_tags": ui_tags}
+    return {"spend_per_tag": ui_tags}


 async def view_spend_logs_from_clickhouse(
@ -291,7 +302,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]):


 def _forecast_daily_cost(data: list):
-    import requests
+    import requests  # type: ignore
    from datetime import datetime, timedelta

    if len(data) == 0:
--- a/index.yaml
+++ b/index.yaml
@ -0,0 +1,108 @@
+apiVersion: v1
+entries:
+  litellm-helm:
+  - apiVersion: v2
+    appVersion: v1.35.38
+    created: "2024-05-06T10:22:24.384392-07:00"
+    dependencies:
+    - condition: db.deployStandalone
+      name: postgresql
+      repository: oci://registry-1.docker.io/bitnamicharts
+      version: '>=13.3.0'
+    - condition: redis.enabled
+      name: redis
+      repository: oci://registry-1.docker.io/bitnamicharts
+      version: '>=18.0.0'
+    description: Call all LLM APIs using the OpenAI format
+    digest: 60f0cfe9e7c1087437cb35f6fb7c43c3ab2be557b6d3aec8295381eb0dfa760f
+    name: litellm-helm
+    type: application
+    urls:
+    - litellm-helm-0.2.0.tgz
+    version: 0.2.0
+  postgresql:
+  - annotations:
+      category: Database
+      images: |
+        - name: os-shell
+          image: docker.io/bitnami/os-shell:12-debian-12-r16
+        - name: postgres-exporter
+          image: docker.io/bitnami/postgres-exporter:0.15.0-debian-12-r14
+        - name: postgresql
+          image: docker.io/bitnami/postgresql:16.2.0-debian-12-r6
+      licenses: Apache-2.0
+    apiVersion: v2
+    appVersion: 16.2.0
+    created: "2024-05-06T10:22:24.387717-07:00"
+    dependencies:
+    - name: common
+      repository: oci://registry-1.docker.io/bitnamicharts
+      tags:
+      - bitnami-common
+      version: 2.x.x
+    description: PostgreSQL (Postgres) is an open source object-relational database
+      known for reliability and data integrity. ACID-compliant, it supports foreign
+      keys, joins, views, triggers and stored procedures.
+    digest: 3c8125526b06833df32e2f626db34aeaedb29d38f03d15349db6604027d4a167
+    home: https://bitnami.com
+    icon: https://bitnami.com/assets/stacks/postgresql/img/postgresql-stack-220x234.png
+    keywords:
+    - postgresql
+    - postgres
+    - database
+    - sql
+    - replication
+    - cluster
+    maintainers:
+    - name: VMware, Inc.
+      url: https://github.com/bitnami/charts
+    name: postgresql
+    sources:
+    - https://github.com/bitnami/charts/tree/main/bitnami/postgresql
+    urls:
+    - charts/postgresql-14.3.1.tgz
+    version: 14.3.1
+  redis:
+  - annotations:
+      category: Database
+      images: |
+        - name: kubectl
+          image: docker.io/bitnami/kubectl:1.29.2-debian-12-r3
+        - name: os-shell
+          image: docker.io/bitnami/os-shell:12-debian-12-r16
+        - name: redis
+          image: docker.io/bitnami/redis:7.2.4-debian-12-r9
+        - name: redis-exporter
+          image: docker.io/bitnami/redis-exporter:1.58.0-debian-12-r4
+        - name: redis-sentinel
+          image: docker.io/bitnami/redis-sentinel:7.2.4-debian-12-r7
+      licenses: Apache-2.0
+    apiVersion: v2
+    appVersion: 7.2.4
+    created: "2024-05-06T10:22:24.391903-07:00"
+    dependencies:
+    - name: common
+      repository: oci://registry-1.docker.io/bitnamicharts
+      tags:
+      - bitnami-common
+      version: 2.x.x
+    description: Redis(R) is an open source, advanced key-value store. It is often
+      referred to as a data structure server since keys can contain strings, hashes,
+      lists, sets and sorted sets.
+    digest: b2fa1835f673a18002ca864c54fadac3c33789b26f6c5e58e2851b0b14a8f984
+    home: https://bitnami.com
+    icon: https://bitnami.com/assets/stacks/redis/img/redis-stack-220x234.png
+    keywords:
+    - redis
+    - keyvalue
+    - database
+    maintainers:
+    - name: VMware, Inc.
+      url: https://github.com/bitnami/charts
+    name: redis
+    sources:
+    - https://github.com/bitnami/charts/tree/main/bitnami/redis
+    urls:
+    - charts/redis-18.19.1.tgz
+    version: 18.19.1
+generated: "2024-05-06T10:22:24.375026-07:00"
--- a/litellm-helm-0.2.0.tgz
+++ b/litellm-helm-0.2.0.tgz
--- a/litellm-js/spend-logs/package-lock.json
+++ b/litellm-js/spend-logs/package-lock.json
@ -5,8 +5,8 @@
  "packages": {
    "": {
      "dependencies": {
-        "@hono/node-server": "^1.9.0",
-        "hono": "^4.1.5"
+        "@hono/node-server": "^1.10.1",
+        "hono": "^4.2.7"
      },
      "devDependencies": {
        "@types/node": "^20.11.17",
@ -382,9 +382,9 @@
      }
    },
    "node_modules/@hono/node-server": {
-      "version": "1.9.0",
-      "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz",
-      "integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==",
+      "version": "1.10.1",
+      "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.10.1.tgz",
+      "integrity": "sha512-5BKW25JH5PQKPDkTcIgv3yNUPtOAbnnjFFgWvIxxAY/B/ZNeYjjWoAeDmqhIiCgOAJ3Tauuw+0G+VainhuZRYQ==",
      "engines": {
        "node": ">=18.14.1"
      }
@ -463,9 +463,9 @@
      }
    },
    "node_modules/hono": {
-      "version": "4.1.5",
-      "resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
-      "integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
+      "version": "4.2.7",
+      "resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
+      "integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
      "engines": {
        "node": ">=16.0.0"
      }
--- a/litellm-js/spend-logs/package.json
+++ b/litellm-js/spend-logs/package.json
@ -3,8 +3,8 @@
    "dev": "tsx watch src/index.ts"
  },
  "dependencies": {
-    "@hono/node-server": "^1.9.0",
-    "hono": "^4.1.5"
+    "@hono/node-server": "^1.10.1",
+    "hono": "^4.2.7"
  },
  "devDependencies": {
    "@types/node": "^20.11.17",
--- a/litellm/init.py
+++ b/litellm/init.py
@ -1,8 +1,12 @@
+### Hide pydantic namespace conflict warnings globally ###
+import warnings
+
+warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
 ### INIT VARIABLES ###
 import threading, requests, os
 from typing import Callable, List, Optional, Dict, Union, Any, Literal
 from litellm.caching import Cache
-from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
+from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
 from litellm.proxy._types import (
    KeyManagementSystem,
    KeyManagementSettings,
@ -11,15 +15,32 @@ from litellm.proxy._types import (
 import httpx
 import dotenv

-dotenv.load_dotenv()
+litellm_mode = os.getenv("LITELLM_MODE", "DEV")  # "PRODUCTION", "DEV"
+if litellm_mode == "DEV":
+    dotenv.load_dotenv()
 #############################################
 if set_verbose == True:
    _turn_on_debug()
 #############################################
+### Callbacks /Logging / Success / Failure Handlers ###
 input_callback: List[Union[str, Callable]] = []
 success_callback: List[Union[str, Callable]] = []
 failure_callback: List[Union[str, Callable]] = []
-callbacks: List[Callable] = []
+service_callback: List[Union[str, Callable]] = []
+_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter"]
+callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
+_langfuse_default_tags: Optional[
+    List[
+        Literal[
+            "user_api_key_alias",
+            "user_api_key_user_id",
+            "user_api_key_user_email",
+            "user_api_key_team_alias",
+            "semantic-similarity",
+            "proxy_base_url",
+        ]
+    ]
+] = None
 _async_input_callback: List[Callable] = (
    []
 )  # internal variable - async custom callbacks are routed here.
@ -31,6 +52,9 @@ _async_failure_callback: List[Callable] = (
 )  # internal variable - async custom callbacks are routed here.
 pre_call_rules: List[Callable] = []
 post_call_rules: List[Callable] = []
+turn_off_message_logging: Optional[bool] = False
+## end of callbacks #############
+
 email: Optional[str] = (
    None  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )
@ -42,24 +66,34 @@ max_tokens = 256  # OpenAI Defaults
 drop_params = False
 modify_params = False
 retry = True
+### AUTH ###
 api_key: Optional[str] = None
 openai_key: Optional[str] = None
 azure_key: Optional[str] = None
 anthropic_key: Optional[str] = None
 replicate_key: Optional[str] = None
 cohere_key: Optional[str] = None
+clarifai_key: Optional[str] = None
 maritalk_key: Optional[str] = None
 ai21_key: Optional[str] = None
+ollama_key: Optional[str] = None
 openrouter_key: Optional[str] = None
+predibase_key: Optional[str] = None
 huggingface_key: Optional[str] = None
 vertex_project: Optional[str] = None
 vertex_location: Optional[str] = None
+predibase_tenant_id: Optional[str] = None
 togetherai_api_key: Optional[str] = None
 cloudflare_api_key: Optional[str] = None
 baseten_key: Optional[str] = None
 aleph_alpha_key: Optional[str] = None
 nlp_cloud_key: Optional[str] = None
+common_cloud_provider_auth_params: dict = {
+    "params": ["project", "region_name", "token"],
+    "providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
+}
 use_client: bool = False
+ssl_verify: bool = True
 disable_streaming_logging: bool = False
 ### GUARDRAILS ###
 llamaguard_model_name: Optional[str] = None
@ -70,6 +104,9 @@ blocked_user_list: Optional[Union[str, List]] = None
 banned_keywords_list: Optional[Union[str, List]] = None
 llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
 ##################
+### PREVIEW FEATURES ###
+enable_preview_features: bool = False
+##################
 logging: bool = True
 caching: bool = (
    False  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
@ -184,6 +221,7 @@ max_end_user_budget: Optional[float] = None
 #### RELIABILITY ####
 request_timeout: Optional[float] = 6000
 num_retries: Optional[int] = None  # per model endpoint
+default_fallbacks: Optional[List] = None
 fallbacks: Optional[List] = None
 context_window_fallbacks: Optional[List] = None
 allowed_fails: int = 0
@ -281,6 +319,7 @@ aleph_alpha_models: List = []
 bedrock_models: List = []
 deepinfra_models: List = []
 perplexity_models: List = []
+watsonx_models: List = []
 for key, value in model_cost.items():
    if value.get("litellm_provider") == "openai":
        open_ai_chat_completion_models.append(key)
@ -325,6 +364,8 @@ for key, value in model_cost.items():
        deepinfra_models.append(key)
    elif value.get("litellm_provider") == "perplexity":
        perplexity_models.append(key)
+    elif value.get("litellm_provider") == "watsonx":
+        watsonx_models.append(key)

 # known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
 openai_compatible_endpoints: List = [
@ -333,6 +374,7 @@ openai_compatible_endpoints: List = [
    "api.deepinfra.com/v1/openai",
    "api.mistral.ai/v1",
    "api.groq.com/openai/v1",
+    "api.deepseek.com/v1",
    "api.together.xyz/v1",
 ]

@ -341,6 +383,7 @@ openai_compatible_providers: List = [
    "anyscale",
    "mistral",
    "groq",
+    "deepseek",
    "deepinfra",
    "perplexity",
    "xinference",
@ -365,6 +408,73 @@ replicate_models: List = [
    "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
 ]

+clarifai_models: List = [
+    "clarifai/meta.Llama-3.Llama-3-8B-Instruct",
+    "clarifai/gcp.generate.gemma-1_1-7b-it",
+    "clarifai/mistralai.completion.mixtral-8x22B",
+    "clarifai/cohere.generate.command-r-plus",
+    "clarifai/databricks.drbx.dbrx-instruct",
+    "clarifai/mistralai.completion.mistral-large",
+    "clarifai/mistralai.completion.mistral-medium",
+    "clarifai/mistralai.completion.mistral-small",
+    "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
+    "clarifai/gcp.generate.gemma-2b-it",
+    "clarifai/gcp.generate.gemma-7b-it",
+    "clarifai/deci.decilm.deciLM-7B-instruct",
+    "clarifai/mistralai.completion.mistral-7B-Instruct",
+    "clarifai/gcp.generate.gemini-pro",
+    "clarifai/anthropic.completion.claude-v1",
+    "clarifai/anthropic.completion.claude-instant-1_2",
+    "clarifai/anthropic.completion.claude-instant",
+    "clarifai/anthropic.completion.claude-v2",
+    "clarifai/anthropic.completion.claude-2_1",
+    "clarifai/meta.Llama-2.codeLlama-70b-Python",
+    "clarifai/meta.Llama-2.codeLlama-70b-Instruct",
+    "clarifai/openai.completion.gpt-3_5-turbo-instruct",
+    "clarifai/meta.Llama-2.llama2-7b-chat",
+    "clarifai/meta.Llama-2.llama2-13b-chat",
+    "clarifai/meta.Llama-2.llama2-70b-chat",
+    "clarifai/openai.chat-completion.gpt-4-turbo",
+    "clarifai/microsoft.text-generation.phi-2",
+    "clarifai/meta.Llama-2.llama2-7b-chat-vllm",
+    "clarifai/upstage.solar.solar-10_7b-instruct",
+    "clarifai/openchat.openchat.openchat-3_5-1210",
+    "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
+    "clarifai/gcp.generate.text-bison",
+    "clarifai/meta.Llama-2.llamaGuard-7b",
+    "clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
+    "clarifai/openai.chat-completion.GPT-4",
+    "clarifai/openai.chat-completion.GPT-3_5-turbo",
+    "clarifai/ai21.complete.Jurassic2-Grande",
+    "clarifai/ai21.complete.Jurassic2-Grande-Instruct",
+    "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
+    "clarifai/ai21.complete.Jurassic2-Jumbo",
+    "clarifai/ai21.complete.Jurassic2-Large",
+    "clarifai/cohere.generate.cohere-generate-command",
+    "clarifai/wizardlm.generate.wizardCoder-Python-34B",
+    "clarifai/wizardlm.generate.wizardLM-70B",
+    "clarifai/tiiuae.falcon.falcon-40b-instruct",
+    "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
+    "clarifai/gcp.generate.code-gecko",
+    "clarifai/gcp.generate.code-bison",
+    "clarifai/mistralai.completion.mistral-7B-OpenOrca",
+    "clarifai/mistralai.completion.openHermes-2-mistral-7B",
+    "clarifai/wizardlm.generate.wizardLM-13B",
+    "clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
+    "clarifai/wizardlm.generate.wizardCoder-15B",
+    "clarifai/microsoft.text-generation.phi-1_5",
+    "clarifai/databricks.Dolly-v2.dolly-v2-12b",
+    "clarifai/bigcode.code.StarCoder",
+    "clarifai/salesforce.xgen.xgen-7b-8k-instruct",
+    "clarifai/mosaicml.mpt.mpt-7b-instruct",
+    "clarifai/anthropic.completion.claude-3-opus",
+    "clarifai/anthropic.completion.claude-3-sonnet",
+    "clarifai/gcp.generate.gemini-1_5-pro",
+    "clarifai/gcp.generate.imagen-2",
+    "clarifai/salesforce.blip.general-english-image-caption-blip-2",
+]
+
+
 huggingface_models: List = [
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-7b-chat-hf",
@ -461,6 +571,7 @@ model_list = (
    + perplexity_models
    + maritalk_models
    + vertex_language_models
+    + watsonx_models
 )

 provider_list: List = [
@ -469,6 +580,7 @@ provider_list: List = [
    "text-completion-openai",
    "cohere",
    "cohere_chat",
+    "clarifai",
    "anthropic",
    "replicate",
    "huggingface",
@ -494,11 +606,15 @@ provider_list: List = [
    "anyscale",
    "mistral",
    "groq",
+    "deepseek",
    "maritalk",
    "voyage",
    "cloudflare",
    "xinference",
    "fireworks_ai",
+    "watsonx",
+    "triton",
+    "predibase",
    "custom",  # custom apis
 ]

@ -512,7 +628,11 @@ models_by_provider: dict = {
    "together_ai": together_ai_models,
    "baseten": baseten_models,
    "openrouter": openrouter_models,
-    "vertex_ai": vertex_chat_models + vertex_text_models,
+    "vertex_ai": vertex_chat_models
+    + vertex_text_models
+    + vertex_anthropic_models
+    + vertex_vision_models
+    + vertex_language_models,
    "ai21": ai21_models,
    "bedrock": bedrock_models,
    "petals": petals_models,
@ -520,6 +640,7 @@ models_by_provider: dict = {
    "deepinfra": deepinfra_models,
    "perplexity": perplexity_models,
    "maritalk": maritalk_models,
+    "watsonx": watsonx_models,
 }

 # mapping for those models which have larger equivalents
@ -570,7 +691,6 @@ all_embedding_models = (
 ####### IMAGE GENERATION MODELS ###################
 openai_image_generation_models = ["dall-e-2", "dall-e-3"]

-
 from .timeout import timeout
 from .utils import (
    client,
@ -578,10 +698,13 @@ from .utils import (
    get_optional_params,
    modify_integration,
    token_counter,
+    create_pretrained_tokenizer,
+    create_tokenizer,
    cost_per_token,
    completion_cost,
    supports_function_calling,
    supports_parallel_function_calling,
+    supports_vision,
    get_litellm_params,
    Logging,
    acreate,
@ -600,12 +723,15 @@ from .utils import (
    get_secret,
    get_supported_openai_params,
    get_api_base,
+    get_first_chars_messages,
 )
 from .llms.huggingface_restapi import HuggingfaceConfig
 from .llms.anthropic import AnthropicConfig
+from .llms.predibase import PredibaseConfig
 from .llms.anthropic_text import AnthropicTextConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere import CohereConfig
+from .llms.clarifai import ClarifaiConfig
 from .llms.ai21 import AI21Config
 from .llms.together_ai import TogetherAIConfig
 from .llms.cloudflare import CloudflareConfig
@ -620,6 +746,7 @@ from .llms.sagemaker import SagemakerConfig
 from .llms.ollama import OllamaConfig
 from .llms.ollama_chat import OllamaChatConfig
 from .llms.maritalk import MaritTalkConfig
+from .llms.bedrock_httpx import AmazonCohereChatConfig
 from .llms.bedrock import (
    AmazonTitanConfig,
    AmazonAI21Config,
@ -629,9 +756,11 @@ from .llms.bedrock import (
    AmazonLlamaConfig,
    AmazonStabilityConfig,
    AmazonMistralConfig,
+    AmazonBedrockGlobalConfig,
 )
-from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
+from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig, MistralConfig
 from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
+from .llms.watsonx import IBMWatsonXAIConfig
 from .main import *  # type: ignore
 from .integrations import *
 from .exceptions import (
@ -654,3 +783,4 @@ from .exceptions import (
 from .budget_manager import BudgetManager
 from .proxy.proxy_cli import run_server
 from .router import Router
+from .assistants.main import *
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@ -1,7 +1,7 @@
 import logging

 set_verbose = False
-
+json_logs = False
 # Create a handler for the logger (you may need to adapt this based on your needs)
 handler = logging.StreamHandler()
 handler.setLevel(logging.DEBUG)
--- a/litellm/_redis.py
+++ b/litellm/_redis.py
@ -10,8 +10,8 @@
 # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
 import os
 import inspect
-import redis, litellm
-import redis.asyncio as async_redis
+import redis, litellm  # type: ignore
+import redis.asyncio as async_redis  # type: ignore
 from typing import List, Optional


@ -32,6 +32,25 @@ def _get_redis_kwargs():
    return available_args


+def _get_redis_url_kwargs(client=None):
+    """
+    Return the keyword-argument names that may be forwarded to a `from_url` constructor.
+
+    Args:
+        client: the `from_url` callable to inspect. Defaults to `redis.Redis.from_url`.
+
+    Returns:
+        List of the callable's positional argument names, minus the excluded ones,
+        followed by "url".
+    """
+    if client is None:
+        client = redis.Redis.from_url
+    # Inspect the callable that was actually requested (sync or async client),
+    # not unconditionally the sync one.
+    arg_spec = inspect.getfullargspec(client)
+
+    # Only allow primitive arguments
+    exclude_args = {
+        "self",
+        "cls",  # getfullargspec reports the already-bound first parameter of a classmethod
+        "connection_pool",
+        "retry",
+    }
+
+    include_args = ["url"]
+
+    available_args = [x for x in arg_spec.args if x not in exclude_args] + include_args
+
+    return available_args
+
+
 def _get_redis_env_kwarg_mapping():
    PREFIX = "REDIS_"

@ -91,27 +110,39 @@ def _get_redis_client_logic(**env_overrides):
        redis_kwargs.pop("password", None)
    elif "host" not in redis_kwargs or redis_kwargs["host"] is None:
        raise ValueError("Either 'host' or 'url' must be specified for redis.")
-    litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
+    # litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
    return redis_kwargs


 def get_redis_client(**env_overrides):
+    """
+    Build a synchronous redis client from the resolved redis kwargs.
+
+    When a `url` is configured the client is created with `redis.Redis.from_url`,
+    receiving only the kwargs reported by `_get_redis_url_kwargs()`; otherwise all
+    kwargs are passed to `redis.Redis`.
+    """
     redis_kwargs = _get_redis_client_logic(**env_overrides)
     if "url" in redis_kwargs and redis_kwargs["url"] is not None:
-        redis_kwargs.pop(
-            "connection_pool", None
-        )  # redis.from_url doesn't support setting your own connection pool
-        return redis.Redis.from_url(**redis_kwargs)
+        args = _get_redis_url_kwargs()
+        url_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                url_kwargs[arg] = redis_kwargs[arg]
+            else:
+                # Same reporting as get_redis_async_client: log the dropped kwarg
+                # by name only (never its value) instead of discarding it silently.
+                litellm.print_verbose(
+                    "REDIS: ignoring argument: {}. Not an allowed redis.Redis.from_url arg.".format(
+                        arg
+                    )
+                )
+
+        return redis.Redis.from_url(**url_kwargs)
     return redis.Redis(**redis_kwargs)


 def get_redis_async_client(**env_overrides):
    redis_kwargs = _get_redis_client_logic(**env_overrides)
    if "url" in redis_kwargs and redis_kwargs["url"] is not None:
-        redis_kwargs.pop(
-            "connection_pool", None
-        )  # redis.from_url doesn't support setting your own connection pool
-        return async_redis.Redis.from_url(**redis_kwargs)
+        args = _get_redis_url_kwargs(client=async_redis.Redis.from_url)
+        url_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                url_kwargs[arg] = redis_kwargs[arg]
+            else:
+                litellm.print_verbose(
+                    "REDIS: ignoring argument: {}. Not an allowed async_redis.Redis.from_url arg.".format(
+                        arg
+                    )
+                )
+        return async_redis.Redis.from_url(**url_kwargs)
+
    return async_redis.Redis(
        socket_timeout=5,
        **redis_kwargs,
@ -124,4 +155,9 @@ def get_redis_connection_pool(**env_overrides):
        return async_redis.BlockingConnectionPool.from_url(
            timeout=5, url=redis_kwargs["url"]
        )
+    connection_class = async_redis.Connection
+    if "ssl" in redis_kwargs and redis_kwargs["ssl"] is not None:
+        connection_class = async_redis.SSLConnection
+        redis_kwargs.pop("ssl", None)
+        redis_kwargs["connection_class"] = connection_class
    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
--- a/litellm/_service_logger.py
+++ b/litellm/_service_logger.py
@ -0,0 +1,130 @@
+import litellm, traceback
+from litellm.proxy._types import UserAPIKeyAuth
+from .types.services import ServiceTypes, ServiceLoggerPayload
+from .integrations.prometheus_services import PrometheusServicesLogger
+from .integrations.custom_logger import CustomLogger
+from datetime import timedelta
+from typing import Union
+
+
+class ServiceLogging(CustomLogger):
+    """
+    Separate class used for monitoring health of litellm-adjacent services (redis/postgres).
+
+    Async success/failure events are forwarded to the backends listed in
+    `litellm.service_callback`; "prometheus_system" is the only backend handled here.
+    """
+
+    def __init__(self, mock_testing: bool = False) -> None:
+        """
+        Args:
+            mock_testing: when True, every hook call bumps its matching
+                `mock_testing_*` counter so tests can assert it was invoked.
+        """
+        self.mock_testing = mock_testing
+        self.mock_testing_sync_success_hook = 0
+        self.mock_testing_async_success_hook = 0
+        self.mock_testing_sync_failure_hook = 0
+        self.mock_testing_async_failure_hook = 0
+        # Always define the attribute: the hooks read it even when "prometheus_system"
+        # is only added to litellm.service_callback after this object was created.
+        self.prometheusServicesLogger = None
+        if "prometheus_system" in litellm.service_callback:
+            self.prometheusServicesLogger = PrometheusServicesLogger()
+
+    def _get_prometheus_services_logger(self):
+        """Return the PrometheusServicesLogger, creating it lazily on first use."""
+        if getattr(self, "prometheusServicesLogger", None) is None:
+            self.prometheusServicesLogger = PrometheusServicesLogger()
+        return self.prometheusServicesLogger
+
+    def service_success_hook(
+        self, service: ServiceTypes, duration: float, call_type: str
+    ):
+        """
+        [TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
+
+        Currently only updates the mock-testing counter.
+        """
+        if self.mock_testing:
+            self.mock_testing_sync_success_hook += 1
+
+    def service_failure_hook(
+        self, service: ServiceTypes, duration: float, error: Exception, call_type: str
+    ):
+        """
+        [TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
+
+        Currently only updates the mock-testing counter.
+        """
+        if self.mock_testing:
+            self.mock_testing_sync_failure_hook += 1
+
+    async def async_service_success_hook(
+        self, service: ServiceTypes, duration: float, call_type: str
+    ):
+        """
+        - For counting if the redis, postgres call is successful
+
+        Builds a ServiceLoggerPayload with `is_error=False` and sends it to every
+        configured service callback.
+        """
+        if self.mock_testing:
+            self.mock_testing_async_success_hook += 1
+
+        payload = ServiceLoggerPayload(
+            is_error=False,
+            error=None,
+            service=service,
+            duration=duration,
+            call_type=call_type,
+        )
+        for callback in litellm.service_callback:
+            if callback == "prometheus_system":
+                prometheus_logger = self._get_prometheus_services_logger()
+                await prometheus_logger.async_service_success_hook(payload=payload)
+
+    async def async_service_failure_hook(
+        self,
+        service: ServiceTypes,
+        duration: float,
+        error: Union[str, Exception],
+        call_type: str,
+    ):
+        """
+        - For counting if the redis, postgres call is unsuccessful
+
+        `error` may be an exception or an already-formatted string; anything else is
+        reported as an empty error message.
+        """
+        if self.mock_testing:
+            self.mock_testing_async_failure_hook += 1
+
+        error_message = ""
+        if isinstance(error, Exception):
+            error_message = str(error)
+        elif isinstance(error, str):
+            error_message = error
+
+        payload = ServiceLoggerPayload(
+            is_error=True,
+            error=error_message,
+            service=service,
+            duration=duration,
+            call_type=call_type,
+        )
+        for callback in litellm.service_callback:
+            if callback == "prometheus_system":
+                prometheus_logger = self._get_prometheus_services_logger()
+                await prometheus_logger.async_service_failure_hook(payload=payload)
+
+    async def async_post_call_failure_hook(
+        self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
+    ):
+        """
+        Hook to track failed litellm-service calls
+
+        Delegates unchanged to the CustomLogger implementation.
+        """
+        return await super().async_post_call_failure_hook(
+            original_exception, user_api_key_dict
+        )
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        """
+        Hook to track latency for litellm proxy llm api calls
+
+        Raises:
+            Exception: if `end_time - start_time` is neither a float nor a timedelta.
+            KeyError: if `kwargs` has no "call_type" entry.
+        """
+        _duration = end_time - start_time
+        if isinstance(_duration, timedelta):
+            _duration = _duration.total_seconds()
+        elif isinstance(_duration, float):
+            pass
+        else:
+            raise Exception(
+                "Duration={} is not a float or timedelta object. type={}".format(
+                    _duration, type(_duration)
+                )
+            )  # invalid _duration value
+        await self.async_service_success_hook(
+            service=ServiceTypes.LITELLM,
+            duration=_duration,
+            call_type=kwargs["call_type"],
+        )
--- a/litellm/assistants/main.py
+++ b/litellm/assistants/main.py
@ -0,0 +1,495 @@
+# What is this?
+## Main file for assistants API logic
+from typing import Iterable
+import os
+import litellm
+from openai import OpenAI
+from litellm import client
+from litellm.utils import supports_httpx_timeout
+from ..llms.openai import OpenAIAssistantsAPI
+from ..types.llms.openai import *
+from ..types.router import *
+
+####### ENVIRONMENT VARIABLES ###################
+openai_assistants_api = OpenAIAssistantsAPI()
+
+### ASSISTANTS ###
+
+
+def get_assistants(
+    custom_llm_provider: Literal["openai"],
+    client: Optional[OpenAI] = None,
+    **kwargs,
+) -> SyncCursorPage[Assistant]:
+    """
+    List assistants for the given provider.
+
+    Only "openai" is supported; any other provider raises
+    `litellm.exceptions.BadRequestError`. Endpoint, organization and key are taken
+    from kwargs first, then litellm module settings, then OPENAI_* env variables.
+    """
+    litellm_params = GenericLiteLLMParams(**kwargs)
+
+    ### TIMEOUT LOGIC ###
+    # explicit timeout -> `request_timeout` kwarg -> 10 minute default
+    timeout = litellm_params.timeout or kwargs.get("request_timeout", 600) or 600
+    if isinstance(timeout, httpx.Timeout):
+        if supports_httpx_timeout(custom_llm_provider) == False:
+            # fall back to the read timeout (default 10 min)
+            timeout = timeout.read or 600
+    else:
+        timeout = float(timeout)  # type: ignore
+
+    if custom_llm_provider != "openai":
+        raise litellm.exceptions.BadRequestError(
+            message="LiteLLM doesn't support {} for 'get_assistants'. Only 'openai' is supported.".format(
+                custom_llm_provider
+            ),
+            model="n/a",
+            llm_provider=custom_llm_provider,
+            response=httpx.Response(
+                status_code=400,
+                content="Unsupported provider",
+                request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+        )
+
+    # for deepinfra/perplexity/anyscale/groq the api base / key are resolved in get_llm_provider
+    api_base = (
+        litellm_params.api_base
+        or litellm.api_base
+        or os.getenv("OPENAI_API_BASE")
+        or "https://api.openai.com/v1"
+    )
+    # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
+    organization = (
+        litellm_params.organization
+        or litellm.organization
+        or os.getenv("OPENAI_ORGANIZATION", None)
+        or None
+    )
+    api_key = (
+        litellm_params.api_key
+        or litellm.api_key
+        or litellm.openai_key
+        or os.getenv("OPENAI_API_KEY")
+    )
+    return openai_assistants_api.get_assistants(
+        api_base=api_base,
+        api_key=api_key,
+        timeout=timeout,
+        max_retries=litellm_params.max_retries,
+        organization=organization,
+        client=client,
+    )
+
+
+### THREADS ###
+
+
+def create_thread(
+    custom_llm_provider: Literal["openai"],
+    messages: Optional[Iterable[OpenAICreateThreadParamsMessage]] = None,
+    metadata: Optional[dict] = None,
+    tool_resources: Optional[OpenAICreateThreadParamsToolResources] = None,
+    client: Optional[OpenAI] = None,
+    **kwargs,
+) -> Thread:
+    """
+    Create a thread with the given provider (only "openai" is supported).
+
+    - resolve timeout / api base / organization / api key
+    - route the call to the OpenAI assistants API wrapper
+    - any other provider raises `litellm.exceptions.BadRequestError`
+
+    NOTE: `tool_resources` is accepted but is not forwarded to the provider call.
+
+    ```
+    from litellm import create_thread
+
+    create_thread(
+        custom_llm_provider="openai",
+        ### OPTIONAL ###
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, what is AI?"
+            },
+            {
+                "role": "user",
+                "content": "How does AI work? Explain it in simple terms."
+            },
+        ],
+    )
+    ```
+    """
+    litellm_params = GenericLiteLLMParams(**kwargs)
+
+    ### TIMEOUT LOGIC ###
+    # explicit timeout -> `request_timeout` kwarg -> 10 minute default
+    timeout = litellm_params.timeout or kwargs.get("request_timeout", 600) or 600
+    if isinstance(timeout, httpx.Timeout):
+        if supports_httpx_timeout(custom_llm_provider) == False:
+            # fall back to the read timeout (default 10 min)
+            timeout = timeout.read or 600
+    else:
+        timeout = float(timeout)  # type: ignore
+
+    if custom_llm_provider != "openai":
+        raise litellm.exceptions.BadRequestError(
+            message="LiteLLM doesn't support {} for 'create_thread'. Only 'openai' is supported.".format(
+                custom_llm_provider
+            ),
+            model="n/a",
+            llm_provider=custom_llm_provider,
+            response=httpx.Response(
+                status_code=400,
+                content="Unsupported provider",
+                request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+        )
+
+    # for deepinfra/perplexity/anyscale/groq the api base / key are resolved in get_llm_provider
+    api_base = (
+        litellm_params.api_base
+        or litellm.api_base
+        or os.getenv("OPENAI_API_BASE")
+        or "https://api.openai.com/v1"
+    )
+    # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
+    organization = (
+        litellm_params.organization
+        or litellm.organization
+        or os.getenv("OPENAI_ORGANIZATION", None)
+        or None
+    )
+    api_key = (
+        litellm_params.api_key
+        or litellm.api_key
+        or litellm.openai_key
+        or os.getenv("OPENAI_API_KEY")
+    )
+    return openai_assistants_api.create_thread(
+        messages=messages,
+        metadata=metadata,
+        api_base=api_base,
+        api_key=api_key,
+        timeout=timeout,
+        max_retries=litellm_params.max_retries,
+        organization=organization,
+        client=client,
+    )
+
+
+def get_thread(
+    custom_llm_provider: Literal["openai"],
+    thread_id: str,
+    client: Optional[OpenAI] = None,
+    **kwargs,
+) -> Thread:
+    """
+    Fetch the thread object identified by `thread_id`.
+
+    Only "openai" is supported; any other provider raises
+    `litellm.exceptions.BadRequestError`.
+    """
+    litellm_params = GenericLiteLLMParams(**kwargs)
+
+    ### TIMEOUT LOGIC ###
+    # explicit timeout -> `request_timeout` kwarg -> 10 minute default
+    timeout = litellm_params.timeout or kwargs.get("request_timeout", 600) or 600
+    if isinstance(timeout, httpx.Timeout):
+        if supports_httpx_timeout(custom_llm_provider) == False:
+            # fall back to the read timeout (default 10 min)
+            timeout = timeout.read or 600
+    else:
+        timeout = float(timeout)  # type: ignore
+
+    if custom_llm_provider != "openai":
+        raise litellm.exceptions.BadRequestError(
+            message="LiteLLM doesn't support {} for 'get_thread'. Only 'openai' is supported.".format(
+                custom_llm_provider
+            ),
+            model="n/a",
+            llm_provider=custom_llm_provider,
+            response=httpx.Response(
+                status_code=400,
+                content="Unsupported provider",
+                request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+        )
+
+    # for deepinfra/perplexity/anyscale/groq the api base / key are resolved in get_llm_provider
+    api_base = (
+        litellm_params.api_base
+        or litellm.api_base
+        or os.getenv("OPENAI_API_BASE")
+        or "https://api.openai.com/v1"
+    )
+    # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
+    organization = (
+        litellm_params.organization
+        or litellm.organization
+        or os.getenv("OPENAI_ORGANIZATION", None)
+        or None
+    )
+    api_key = (
+        litellm_params.api_key
+        or litellm.api_key
+        or litellm.openai_key
+        or os.getenv("OPENAI_API_KEY")
+    )
+    return openai_assistants_api.get_thread(
+        thread_id=thread_id,
+        api_base=api_base,
+        api_key=api_key,
+        timeout=timeout,
+        max_retries=litellm_params.max_retries,
+        organization=organization,
+        client=client,
+    )
+
+
+### MESSAGES ###
+
+
+def add_message(
+    custom_llm_provider: Literal["openai"],
+    thread_id: str,
+    role: Literal["user", "assistant"],
+    content: str,
+    attachments: Optional[List[Attachment]] = None,
+    metadata: Optional[dict] = None,
+    client: Optional[OpenAI] = None,
+    **kwargs,
+) -> OpenAIMessage:
+    """
+    Add a message to the thread identified by `thread_id`.
+
+    `role`, `content`, `attachments` and `metadata` are bundled into a `MessageData`
+    and sent through the OpenAI assistants API wrapper. Only "openai" is supported;
+    any other provider raises `litellm.exceptions.BadRequestError`.
+    """
+    ### COMMON OBJECTS ###
+    message_data = MessageData(
+        role=role, content=content, attachments=attachments, metadata=metadata
+    )
+    optional_params = GenericLiteLLMParams(**kwargs)
+
+    ### TIMEOUT LOGIC ###
+    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
+    # set timeout for 10 minutes by default
+
+    if (
+        timeout is not None
+        and isinstance(timeout, httpx.Timeout)
+        and supports_httpx_timeout(custom_llm_provider) == False
+    ):
+        read_timeout = timeout.read or 600
+        timeout = read_timeout  # default 10 min timeout
+    elif timeout is not None and not isinstance(timeout, httpx.Timeout):
+        timeout = float(timeout)  # type: ignore
+    elif timeout is None:
+        timeout = 600.0
+
+    response: Optional[OpenAIMessage] = None
+    if custom_llm_provider == "openai":
+        api_base = (
+            optional_params.api_base  # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
+            or litellm.api_base
+            or os.getenv("OPENAI_API_BASE")
+            or "https://api.openai.com/v1"
+        )
+        organization = (
+            optional_params.organization
+            or litellm.organization
+            or os.getenv("OPENAI_ORGANIZATION", None)
+            or None  # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
+        )
+        # set API KEY
+        api_key = (
+            optional_params.api_key
+            or litellm.api_key  # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
+            or litellm.openai_key
+            or os.getenv("OPENAI_API_KEY")
+        )
+        response = openai_assistants_api.add_message(
+            thread_id=thread_id,
+            message_data=message_data,
+            api_base=api_base,
+            api_key=api_key,
+            timeout=timeout,
+            max_retries=optional_params.max_retries,
+            organization=organization,
+            client=client,
+        )
+    else:
+        # The message must name this function, not 'create_thread'.
+        raise litellm.exceptions.BadRequestError(
+            message="LiteLLM doesn't support {} for 'add_message'. Only 'openai' is supported.".format(
+                custom_llm_provider
+            ),
+            model="n/a",
+            llm_provider=custom_llm_provider,
+            response=httpx.Response(
+                status_code=400,
+                content="Unsupported provider",
+                request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+        )
+
+    return response
+
+
+def get_messages(
+    custom_llm_provider: Literal["openai"],
+    thread_id: str,
+    client: Optional[OpenAI] = None,
+    **kwargs,
+) -> SyncCursorPage[OpenAIMessage]:
+    """
+    List the messages of the thread identified by `thread_id`.
+
+    Only "openai" is supported; any other provider raises
+    `litellm.exceptions.BadRequestError`.
+    """
+    litellm_params = GenericLiteLLMParams(**kwargs)
+
+    ### TIMEOUT LOGIC ###
+    # explicit timeout -> `request_timeout` kwarg -> 10 minute default
+    timeout = litellm_params.timeout or kwargs.get("request_timeout", 600) or 600
+    if isinstance(timeout, httpx.Timeout):
+        if supports_httpx_timeout(custom_llm_provider) == False:
+            # fall back to the read timeout (default 10 min)
+            timeout = timeout.read or 600
+    else:
+        timeout = float(timeout)  # type: ignore
+
+    if custom_llm_provider != "openai":
+        raise litellm.exceptions.BadRequestError(
+            message="LiteLLM doesn't support {} for 'get_messages'. Only 'openai' is supported.".format(
+                custom_llm_provider
+            ),
+            model="n/a",
+            llm_provider=custom_llm_provider,
+            response=httpx.Response(
+                status_code=400,
+                content="Unsupported provider",
+                request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+        )
+
+    # for deepinfra/perplexity/anyscale/groq the api base / key are resolved in get_llm_provider
+    api_base = (
+        litellm_params.api_base
+        or litellm.api_base
+        or os.getenv("OPENAI_API_BASE")
+        or "https://api.openai.com/v1"
+    )
+    # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
+    organization = (
+        litellm_params.organization
+        or litellm.organization
+        or os.getenv("OPENAI_ORGANIZATION", None)
+        or None
+    )
+    api_key = (
+        litellm_params.api_key
+        or litellm.api_key
+        or litellm.openai_key
+        or os.getenv("OPENAI_API_KEY")
+    )
+    return openai_assistants_api.get_messages(
+        thread_id=thread_id,
+        api_base=api_base,
+        api_key=api_key,
+        timeout=timeout,
+        max_retries=litellm_params.max_retries,
+        organization=organization,
+        client=client,
+    )
+
+
+### RUNS ###
+
+
+def run_thread(
+    custom_llm_provider: Literal["openai"],
+    thread_id: str,
+    assistant_id: str,
+    additional_instructions: Optional[str] = None,
+    instructions: Optional[str] = None,
+    metadata: Optional[dict] = None,
+    model: Optional[str] = None,
+    stream: Optional[bool] = None,
+    tools: Optional[Iterable[AssistantToolParam]] = None,
+    client: Optional[OpenAI] = None,
+    **kwargs,
+) -> Run:
+    """
+    Run the assistant `assistant_id` on the thread `thread_id`.
+
+    The run options (instructions, metadata, model, stream, tools) are passed through
+    unchanged. Only "openai" is supported; any other provider raises
+    `litellm.exceptions.BadRequestError`.
+    """
+    litellm_params = GenericLiteLLMParams(**kwargs)
+
+    ### TIMEOUT LOGIC ###
+    # explicit timeout -> `request_timeout` kwarg -> 10 minute default
+    timeout = litellm_params.timeout or kwargs.get("request_timeout", 600) or 600
+    if isinstance(timeout, httpx.Timeout):
+        if supports_httpx_timeout(custom_llm_provider) == False:
+            # fall back to the read timeout (default 10 min)
+            timeout = timeout.read or 600
+    else:
+        timeout = float(timeout)  # type: ignore
+
+    if custom_llm_provider != "openai":
+        raise litellm.exceptions.BadRequestError(
+            message="LiteLLM doesn't support {} for 'run_thread'. Only 'openai' is supported.".format(
+                custom_llm_provider
+            ),
+            model="n/a",
+            llm_provider=custom_llm_provider,
+            response=httpx.Response(
+                status_code=400,
+                content="Unsupported provider",
+                request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+        )
+
+    # for deepinfra/perplexity/anyscale/groq the api base / key are resolved in get_llm_provider
+    api_base = (
+        litellm_params.api_base
+        or litellm.api_base
+        or os.getenv("OPENAI_API_BASE")
+        or "https://api.openai.com/v1"
+    )
+    # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
+    organization = (
+        litellm_params.organization
+        or litellm.organization
+        or os.getenv("OPENAI_ORGANIZATION", None)
+        or None
+    )
+    api_key = (
+        litellm_params.api_key
+        or litellm.api_key
+        or litellm.openai_key
+        or os.getenv("OPENAI_API_KEY")
+    )
+    return openai_assistants_api.run_thread(
+        thread_id=thread_id,
+        assistant_id=assistant_id,
+        additional_instructions=additional_instructions,
+        instructions=instructions,
+        metadata=metadata,
+        model=model,
+        stream=stream,
+        tools=tools,
+        api_base=api_base,
+        api_key=api_key,
+        timeout=timeout,
+        max_retries=litellm_params.max_retries,
+        organization=organization,
+        client=client,
+    )
--- a/litellm/budget_manager.py
+++ b/litellm/budget_manager.py
@ -10,7 +10,7 @@
 import os, json, time
 import litellm
 from litellm.utils import ModelResponse
-import requests, threading
+import requests, threading  # type: ignore
 from typing import Optional, Union, Literal


--- a/litellm/caching.py
+++ b/litellm/caching.py
@ -13,6 +13,7 @@ import json, traceback, ast, hashlib
 from typing import Optional, Literal, List, Union, Any, BinaryIO
 from openai._models import BaseModel as OpenAIObject
 from litellm._logging import verbose_logger
+from litellm.types.services import ServiceLoggerPayload, ServiceTypes
 import traceback


@ -88,6 +89,13 @@ class InMemoryCache(BaseCache):
            return_val.append(val)
        return return_val

+    def increment_cache(self, key, value: int, **kwargs) -> int:
+        """Add `value` to the number cached under `key` (missing/falsy counts as 0), store it and return the new total."""
+        current_total = self.get_cache(key=key) or 0
+        new_total = current_total + value
+        self.set_cache(key, new_total, **kwargs)
+        return new_total
+
    async def async_get_cache(self, key, **kwargs):
        return self.get_cache(key=key, **kwargs)

@ -98,11 +106,12 @@ class InMemoryCache(BaseCache):
            return_val.append(val)
        return return_val

-    async def async_increment(self, key, value: int, **kwargs):
+    async def async_increment(self, key, value: float, **kwargs) -> float:
+        """Add `value` to the number cached under `key` (missing/falsy counts as 0), store it and return the new total."""
         # get the value
         init_value = await self.async_get_cache(key=key) or 0
         value = init_value + value
         await self.async_set_cache(key, value, **kwargs)
+        return value

    def flush_cache(self):
        self.cache_dict.clear()
@ -129,6 +138,7 @@ class RedisCache(BaseCache):
        **kwargs,
    ):
        from ._redis import get_redis_client, get_redis_connection_pool
+        from litellm._service_logger import ServiceLogging
        import redis

        redis_kwargs = {}
@ -139,18 +149,19 @@ class RedisCache(BaseCache):
        if password is not None:
            redis_kwargs["password"] = password

+        ### HEALTH MONITORING OBJECT ###
+        if kwargs.get("service_logger_obj", None) is not None and isinstance(
+            kwargs["service_logger_obj"], ServiceLogging
+        ):
+            self.service_logger_obj = kwargs.pop("service_logger_obj")
+        else:
+            self.service_logger_obj = ServiceLogging()
+
        redis_kwargs.update(kwargs)
        self.redis_client = get_redis_client(**redis_kwargs)
        self.redis_kwargs = redis_kwargs
        self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)

-        if "url" in redis_kwargs and redis_kwargs["url"] is not None:
-            parsed_kwargs = redis.connection.parse_url(redis_kwargs["url"])
-            redis_kwargs.update(parsed_kwargs)
-            self.redis_kwargs.update(parsed_kwargs)
-            # pop url
-            self.redis_kwargs.pop("url")
-
        # redis namespaces
        self.namespace = namespace
        # for high traffic, we store the redis results in memory and then batch write to redis
@ -162,6 +173,23 @@ class RedisCache(BaseCache):
        except Exception as e:
            pass

+        ### ASYNC HEALTH PING ###
+        try:
+            # asyncio.get_running_loop().create_task(self.ping())
+            result = asyncio.get_running_loop().create_task(self.ping())
+        except Exception as e:
+            verbose_logger.error(
+                "Error connecting to Async Redis client", extra={"error": str(e)}
+            )
+
+        ### SYNC HEALTH PING ###
+        try:
+            self.redis_client.ping()
+        except Exception as e:
+            verbose_logger.error(
+                "Error connecting to Sync Redis client", extra={"error": str(e)}
+            )
+
    def init_async_client(self):
        from ._redis import get_redis_async_client

@ -192,18 +220,101 @@ class RedisCache(BaseCache):
                f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
            )

+    def increment_cache(self, key, value: int, **kwargs) -> int:
+        """
+        Increment `key` by `value` using the synchronous Redis client.
+
+        Args:
+            key: Redis key to increment (passed to the client unchanged).
+            value: integer amount forwarded to the client's `incr` call.
+            **kwargs: accepted for interface compatibility; not used here.
+
+        Returns:
+            Whatever the Redis client's `incr` call returns.
+
+        Raises:
+            Exception: any error raised by the Redis client is logged and re-raised.
+        """
+        _redis_client = self.redis_client
+        start_time = time.time()
+        try:
+            result = _redis_client.incr(name=key, amount=value)
+        except Exception as e:
+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            # This is a sync method and may run without an event loop, so the
+            # sync hook is called directly instead of via asyncio.create_task().
+            self.service_logger_obj.service_failure_hook(
+                service=ServiceTypes.REDIS,
+                duration=_duration,
+                error=e,
+                call_type="increment_cache",
+            )
+            verbose_logger.error(
+                "LiteLLM Redis Caching: increment_cache() - Got exception from REDIS %s, Writing value=%s",
+                str(e),
+                value,
+            )
+            traceback.print_exc()
+            raise e
+
+        ## LOGGING ##
+        # Kept outside the try block so a logging problem is never reported as
+        # a Redis failure.
+        end_time = time.time()
+        _duration = end_time - start_time
+        self.service_logger_obj.service_success_hook(
+            service=ServiceTypes.REDIS,
+            duration=_duration,
+            call_type="increment_cache",
+        )
+        return result
+
    async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
-        keys = []
-        _redis_client = self.init_async_client()
-        async with _redis_client as redis_client:
-            async for key in redis_client.scan_iter(match=pattern + "*", count=count):
-                keys.append(key)
-                if len(keys) >= count:
-                    break
-        return keys
+        """
+        Collect keys matching `pattern + "*"` from the async Redis client.
+
+        Args:
+            pattern: key prefix; a trailing "*" is appended before scanning.
+            count: passed to `scan_iter` and also used as the maximum number
+                of keys collected before the scan stops.
+
+        Returns:
+            list of the keys yielded by `scan_iter`, at most `count` long.
+
+        Raises:
+            Exception: any error is reported to the failure hook and re-raised.
+        """
+        start_time = time.time()
+        try:
+            keys = []
+            _redis_client = self.init_async_client()
+            async with _redis_client as redis_client:
+                async for key in redis_client.scan_iter(
+                    match=pattern + "*", count=count
+                ):
+                    keys.append(key)
+                    # stop early once `count` keys have been collected
+                    if len(keys) >= count:
+                        break
+
+                ## LOGGING ##
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_success_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        call_type="async_scan_iter",
+                    )
+                )  # DO NOT SLOW DOWN CALL B/C OF THIS
+            return keys
+        except Exception as e:
+            # NON blocking - notify users Redis is throwing an exception
+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_failure_hook(
+                    service=ServiceTypes.REDIS,
+                    duration=_duration,
+                    error=e,
+                    call_type="async_scan_iter",
+                )
+            )
+            raise e

    async def async_set_cache(self, key, value, **kwargs):
-        _redis_client = self.init_async_client()
+        start_time = time.time()
+        try:
+            _redis_client = self.init_async_client()
+        except Exception as e:
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_failure_hook(
+                    service=ServiceTypes.REDIS, duration=_duration, error=e
+                )
+            )
+            # NON blocking - notify users Redis is throwing an exception
+            verbose_logger.error(
+                "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
+                str(e),
+                value,
+            )
+            traceback.print_exc()
+
        key = self.check_and_fix_namespace(key=key)
        async with _redis_client as redis_client:
            ttl = kwargs.get("ttl", None)
@ -215,7 +326,26 @@ class RedisCache(BaseCache):
                print_verbose(
                    f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
                )
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_success_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        call_type="async_set_cache",
+                    )
+                )
            except Exception as e:
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_failure_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        error=e,
+                        call_type="async_set_cache",
+                    )
+                )
                # NON blocking - notify users Redis is throwing an exception
                verbose_logger.error(
                    "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
@ -229,6 +359,11 @@ class RedisCache(BaseCache):
        Use Redis Pipelines for bulk write operations
        """
        _redis_client = self.init_async_client()
+        start_time = time.time()
+
+        print_verbose(
+            f"Set Async Redis Cache: key list: {cache_list}\nttl={ttl}, redis_version={self.redis_version}"
+        )
        try:
            async with _redis_client as redis_client:
                async with redis_client.pipeline(transaction=True) as pipe:
@ -238,18 +373,41 @@ class RedisCache(BaseCache):
                        print_verbose(
                            f"Set ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {cache_value}\nttl={ttl}"
                        )
+                        json_cache_value = json.dumps(cache_value)
                        # Set the value with a TTL if it's provided.
                        if ttl is not None:
-                            pipe.setex(cache_key, ttl, json.dumps(cache_value))
+                            pipe.setex(cache_key, ttl, json_cache_value)
                        else:
-                            pipe.set(cache_key, json.dumps(cache_value))
+                            pipe.set(cache_key, json_cache_value)
                    # Execute the pipeline and return the results.
                    results = await pipe.execute()

            print_verbose(f"pipeline results: {results}")
            # Optionally, you could process 'results' to make sure that all set operations were successful.
+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_success_hook(
+                    service=ServiceTypes.REDIS,
+                    duration=_duration,
+                    call_type="async_set_cache_pipeline",
+                )
+            )
            return results
        except Exception as e:
+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_failure_hook(
+                    service=ServiceTypes.REDIS,
+                    duration=_duration,
+                    error=e,
+                    call_type="async_set_cache_pipeline",
+                )
+            )
+
            verbose_logger.error(
                "LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
                str(e),
@ -264,20 +422,44 @@ class RedisCache(BaseCache):
        key = self.check_and_fix_namespace(key=key)
        self.redis_batch_writing_buffer.append((key, value))
        if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
-            await self.flush_cache_buffer()
+            await self.flush_cache_buffer()  # logging done in here

-    async def async_increment(self, key, value: int, **kwargs):
+    async def async_increment(self, key, value: float, **kwargs) -> float:
+        """
+        Increment `key` by `value` through the async Redis client.
+
+        Uses `incrbyfloat`, so `value` may be fractional. The key is passed to
+        the client as given; no namespace fix-up is applied in this method.
+
+        Returns:
+            The result of the `incrbyfloat` call.
+
+        Raises:
+            Exception: any error is reported to the failure hook, logged and
+                re-raised.
+        """
        _redis_client = self.init_async_client()
+        start_time = time.time()
        try:
            async with _redis_client as redis_client:
-                await redis_client.incr(name=key, amount=value)
+                result = await redis_client.incrbyfloat(name=key, amount=value)
+                ## LOGGING ##
+                end_time = time.time()
+                _duration = end_time - start_time
+                # hook is scheduled as a task rather than awaited inline
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_success_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        call_type="async_increment",
+                    )
+                )
+                return result
        except Exception as e:
+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_failure_hook(
+                    service=ServiceTypes.REDIS,
+                    duration=_duration,
+                    error=e,
+                    call_type="async_increment",
+                )
+            )
            verbose_logger.error(
                "LiteLLM Redis Caching: async async_increment() - Got exception from REDIS %s, Writing value=%s",
                str(e),
                value,
            )
            traceback.print_exc()
+            raise e

    async def flush_cache_buffer(self):
        print_verbose(
@ -345,6 +527,7 @@ class RedisCache(BaseCache):
    async def async_get_cache(self, key, **kwargs):
        _redis_client = self.init_async_client()
        key = self.check_and_fix_namespace(key=key)
+        start_time = time.time()
        async with _redis_client as redis_client:
            try:
                print_verbose(f"Get Async Redis Cache: key: {key}")
@ -353,8 +536,29 @@ class RedisCache(BaseCache):
                    f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
                )
                response = self._get_cache_logic(cached_response=cached_response)
+                ## LOGGING ##
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_success_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        call_type="async_get_cache",
+                    )
+                )
                return response
            except Exception as e:
+                ## LOGGING ##
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_failure_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        error=e,
+                        call_type="async_get_cache",
+                    )
+                )
                # NON blocking - notify users Redis is throwing an exception
                print_verbose(
                    f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
@ -366,6 +570,7 @@ class RedisCache(BaseCache):
        """
        _redis_client = await self.init_async_client()
        key_value_dict = {}
+        start_time = time.time()
        try:
            async with _redis_client as redis_client:
                _keys = []
@ -374,29 +579,110 @@ class RedisCache(BaseCache):
                    _keys.append(cache_key)
                results = await redis_client.mget(keys=_keys)

+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_success_hook(
+                    service=ServiceTypes.REDIS,
+                    duration=_duration,
+                    call_type="async_batch_get_cache",
+                )
+            )
+
            # Associate the results back with their keys.
            # 'results' is a list of values corresponding to the order of keys in 'key_list'.
            key_value_dict = dict(zip(key_list, results))

-            decoded_results = {
-                k.decode("utf-8"): self._get_cache_logic(v)
-                for k, v in key_value_dict.items()
-            }
+            decoded_results = {}
+            for k, v in key_value_dict.items():
+                if isinstance(k, bytes):
+                    k = k.decode("utf-8")
+                v = self._get_cache_logic(v)
+                decoded_results[k] = v

            return decoded_results
        except Exception as e:
+            ## LOGGING ##
+            end_time = time.time()
+            _duration = end_time - start_time
+            asyncio.create_task(
+                self.service_logger_obj.async_service_failure_hook(
+                    service=ServiceTypes.REDIS,
+                    duration=_duration,
+                    error=e,
+                    call_type="async_batch_get_cache",
+                )
+            )
            print_verbose(f"Error occurred in pipeline read - {str(e)}")
            return key_value_dict

-    async def ping(self):
+    def sync_ping(self) -> bool:
+        """
+        Ping Redis through the synchronous client.
+
+        Reports the call duration to the service logger's sync success or
+        failure hook. On failure the exception is printed and re-raised.
+
+        Returns:
+            The response of the client's `ping()` call.
+        """
+        print_verbose(f"Pinging Sync Redis Cache")
+        started_at = time.time()
+        try:
+            pong = self.redis_client.ping()
+            print_verbose(f"Redis Cache PING: {pong}")
+            ## LOGGING ##
+            elapsed = time.time() - started_at
+            self.service_logger_obj.service_success_hook(
+                service=ServiceTypes.REDIS, duration=elapsed, call_type="sync_ping"
+            )
+            return pong
+        except Exception as e:
+            # NON blocking - notify users Redis is throwing an exception
+            ## LOGGING ##
+            elapsed = time.time() - started_at
+            self.service_logger_obj.service_failure_hook(
+                service=ServiceTypes.REDIS,
+                duration=elapsed,
+                error=e,
+                call_type="sync_ping",
+            )
+            print_verbose(
+                f"LiteLLM Redis Cache PING: - Got exception from REDIS : {str(e)}"
+            )
+            traceback.print_exc()
+            raise e
+
+    async def ping(self) -> bool:
        _redis_client = self.init_async_client()
+        start_time = time.time()
        async with _redis_client as redis_client:
            print_verbose(f"Pinging Async Redis Cache")
            try:
                response = await redis_client.ping()
-                print_verbose(f"Redis Cache PING: {response}")
+                ## LOGGING ##
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_success_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        call_type="async_ping",
+                    )
+                )
+                return response
            except Exception as e:
                # NON blocking - notify users Redis is throwing an exception
+                ## LOGGING ##
+                end_time = time.time()
+                _duration = end_time - start_time
+                asyncio.create_task(
+                    self.service_logger_obj.async_service_failure_hook(
+                        service=ServiceTypes.REDIS,
+                        duration=_duration,
+                        error=e,
+                        call_type="async_ping",
+                    )
+                )
                print_verbose(
                    f"LiteLLM Redis Cache PING: - Got exception from REDIS : {str(e)}"
                )
@ -525,9 +811,7 @@ class RedisSemanticCache(BaseCache):

        # get the prompt
        messages = kwargs["messages"]
-        prompt = ""
-        for message in messages:
-            prompt += message["content"]
+        prompt = "".join(message["content"] for message in messages)

        # create an embedding for prompt
        embedding_response = litellm.embedding(
@ -562,9 +846,7 @@ class RedisSemanticCache(BaseCache):

        # get the messages
        messages = kwargs["messages"]
-        prompt = ""
-        for message in messages:
-            prompt += message["content"]
+        prompt = "".join(message["content"] for message in messages)

        # convert to embedding
        embedding_response = litellm.embedding(
@ -624,9 +906,7 @@ class RedisSemanticCache(BaseCache):

        # get the prompt
        messages = kwargs["messages"]
-        prompt = ""
-        for message in messages:
-            prompt += message["content"]
+        prompt = "".join(message["content"] for message in messages)
        # create an embedding for prompt
        router_model_names = (
            [m["model_name"] for m in llm_model_list]
@ -679,9 +959,7 @@ class RedisSemanticCache(BaseCache):

        # get the messages
        messages = kwargs["messages"]
-        prompt = ""
-        for message in messages:
-            prompt += message["content"]
+        prompt = "".join(message["content"] for message in messages)

        router_model_names = (
            [m["model_name"] for m in llm_model_list]
@ -927,6 +1205,30 @@ class DualCache(BaseCache):
        except Exception as e:
            print_verbose(e)

+    def increment_cache(
+        self, key, value: int, local_only: bool = False, **kwargs
+    ) -> int:
+        """
+        Increment `key` in the in-memory cache and, unless `local_only`, in Redis.
+
+        Key - the key in cache
+
+        Value - int - the value you want to increment by
+
+        local_only - bool - when True, the Redis cache is not touched
+
+        Returns - int - the incremented value. The Redis result takes
+        precedence when Redis is used; if neither cache is configured,
+        `value` itself is returned.
+
+        Raises - any exception from either cache is printed and re-raised.
+        """
+        try:
+            result: int = value
+            if self.in_memory_cache is not None:
+                result = self.in_memory_cache.increment_cache(key, value, **kwargs)
+
+            if self.redis_cache is not None and local_only == False:
+                result = self.redis_cache.increment_cache(key, value, **kwargs)
+
+            return result
+        except Exception as e:
+            # name the method that actually failed (was a copy-pasted message)
+            print_verbose(f"LiteLLM Cache: Exception increment_cache: {str(e)}")
+            traceback.print_exc()
+            raise e
+
    def get_cache(self, key, local_only: bool = False, **kwargs):
        # Try to fetch from in-memory cache first
        try:
@ -979,7 +1281,7 @@ class DualCache(BaseCache):
                        self.in_memory_cache.set_cache(key, redis_result[key], **kwargs)

                for key, value in redis_result.items():
-                    result[sublist_keys.index(key)] = value
+                    result[keys.index(key)] = value

            print_verbose(f"async batch get cache: cache result: {result}")
            return result
@ -1029,10 +1331,8 @@ class DualCache(BaseCache):
                    keys, **kwargs
                )

-                print_verbose(f"in_memory_result: {in_memory_result}")
                if in_memory_result is not None:
                    result = in_memory_result
-
            if None in result and self.redis_cache is not None and local_only == False:
                """
                - for the none values in the result
@ -1048,22 +1348,23 @@ class DualCache(BaseCache):

                if redis_result is not None:
                    # Update in-memory cache with the value from Redis
-                    for key in redis_result:
-                        await self.in_memory_cache.async_set_cache(
-                            key, redis_result[key], **kwargs
-                        )
+                    for key, value in redis_result.items():
+                        if value is not None:
+                            await self.in_memory_cache.async_set_cache(
+                                key, redis_result[key], **kwargs
+                            )
+                for key, value in redis_result.items():
+                    index = keys.index(key)
+                    result[index] = value

-                sublist_dict = dict(zip(sublist_keys, redis_result))
-
-                for key, value in sublist_dict.items():
-                    result[sublist_keys.index(key)] = value
-
-            print_verbose(f"async batch get cache: cache result: {result}")
            return result
        except Exception as e:
            traceback.print_exc()

    async def async_set_cache(self, key, value, local_only: bool = False, **kwargs):
+        print_verbose(
+            f"async set cache: cache key: {key}; local_only: {local_only}; value: {value}"
+        )
        try:
            if self.in_memory_cache is not None:
                await self.in_memory_cache.async_set_cache(key, value, **kwargs)
@ -1074,24 +1375,55 @@ class DualCache(BaseCache):
            print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
            traceback.print_exc()

-    async def async_increment_cache(
-        self, key, value: int, local_only: bool = False, **kwargs
+    async def async_batch_set_cache(
+        self, cache_list: list, local_only: bool = False, **kwargs
    ):
        """
-        Key - the key in cache
-
-        Value - int - the value you want to increment by
+        Batch write values to the cache
+
+        cache_list - list - entries to write; presumably (key, value) pairs,
+        verify against the `async_set_cache_pipeline` implementations
+
+        local_only - bool - when True, only the in-memory cache is written
+
+        The in-memory cache receives all of `**kwargs`; the Redis cache only
+        receives `ttl` (taken from kwargs, None when absent).
+
+        Exceptions are printed and swallowed; nothing is returned.
        """
+        print_verbose(
+            f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
+        )
        try:
            if self.in_memory_cache is not None:
-                await self.in_memory_cache.async_increment(key, value, **kwargs)
+                await self.in_memory_cache.async_set_cache_pipeline(
+                    cache_list=cache_list, **kwargs
+                )

            if self.redis_cache is not None and local_only == False:
-                await self.redis_cache.async_increment(key, value, **kwargs)
+                await self.redis_cache.async_set_cache_pipeline(
+                    cache_list=cache_list, ttl=kwargs.get("ttl", None)
+                )
        except Exception as e:
            print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
            traceback.print_exc()

+    async def async_increment_cache(
+        self, key, value: float, local_only: bool = False, **kwargs
+    ) -> float:
+        """
+        Increment `key` in the in-memory cache and, unless `local_only`, in Redis.
+
+        Key - the key in cache
+
+        Value - float - the value you want to increment by
+
+        local_only - bool - when True, the Redis cache is not touched
+
+        Returns - float - the incremented value. The Redis result takes
+        precedence when Redis is used; if neither cache is configured,
+        `value` itself is returned.
+
+        Raises - any exception from either cache is printed and re-raised.
+        """
+        try:
+            result: float = value
+            if self.in_memory_cache is not None:
+                result = await self.in_memory_cache.async_increment(
+                    key, value, **kwargs
+                )
+
+            if self.redis_cache is not None and local_only == False:
+                result = await self.redis_cache.async_increment(key, value, **kwargs)
+
+            return result
+        except Exception as e:
+            # name the method that actually failed (was a copy-pasted message)
+            print_verbose(
+                f"LiteLLM Cache: Exception async_increment_cache: {str(e)}"
+            )
+            traceback.print_exc()
+            raise e
+
    def flush_cache(self):
        if self.in_memory_cache is not None:
            self.in_memory_cache.flush_cache()
@ -1109,7 +1441,7 @@ class DualCache(BaseCache):
 class Cache:
    def __init__(
        self,
-        type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
+        type: Optional[Literal["local", "redis", "redis-semantic", "s3", "disk"]] = "local",
        host: Optional[str] = None,
        port: Optional[str] = None,
        password: Optional[str] = None,
@ -1152,13 +1484,14 @@ class Cache:
        redis_semantic_cache_use_async=False,
        redis_semantic_cache_embedding_model="text-embedding-ada-002",
        redis_flush_size=None,
+        disk_cache_dir=None,
        **kwargs,
    ):
        """
        Initializes the cache based on the given type.

        Args:
-            type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local".
+            type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", "s3" or "disk". Defaults to "local".
            host (str, optional): The host address for the Redis cache. Required if type is "redis".
            port (int, optional): The port number for the Redis cache. Required if type is "redis".
            password (str, optional): The password for the Redis cache. Required if type is "redis".
@ -1204,6 +1537,8 @@ class Cache:
                s3_path=s3_path,
                **kwargs,
            )
+        elif type == "disk":
+            self.cache = DiskCache(disk_cache_dir=disk_cache_dir)
        if "cache" not in litellm.input_callback:
            litellm.input_callback.append("cache")
        if "cache" not in litellm.success_callback:
@ -1575,8 +1910,86 @@ class Cache:
            await self.cache.disconnect()


+class DiskCache(BaseCache):
+    """
+    Cache backend that stores entries on local disk through the `diskcache` package.
+    """
+
+    def __init__(self, disk_cache_dir: Optional[str] = None):
+        """
+        Args:
+            disk_cache_dir: directory for the on-disk cache. When None, the
+                default ".litellm_cache" directory is used.
+        """
+        import diskcache as dc
+
+        # if users don't provide one, use the default litellm cache
+        if disk_cache_dir is None:
+            self.disk_cache = dc.Cache(".litellm_cache")
+        else:
+            self.disk_cache = dc.Cache(disk_cache_dir)
+
+    def set_cache(self, key, value, **kwargs):
+        """Store `value` under `key`; an optional `ttl` kwarg is used as the expiry."""
+        print_verbose("DiskCache: set_cache")
+        if "ttl" in kwargs:
+            self.disk_cache.set(key, value, expire=kwargs["ttl"])
+        else:
+            self.disk_cache.set(key, value)
+
+    async def async_set_cache(self, key, value, **kwargs):
+        """Async wrapper around `set_cache`."""
+        self.set_cache(key=key, value=value, **kwargs)
+
+    async def async_set_cache_pipeline(self, cache_list, ttl=None):
+        """Write every (key, value) pair in `cache_list`, applying `ttl` when given."""
+        for cache_key, cache_value in cache_list:
+            if ttl is not None:
+                self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
+            else:
+                self.set_cache(key=cache_key, value=cache_value)
+
+    def get_cache(self, key, **kwargs):
+        """
+        Return the cached value for `key`, or None on a miss.
+
+        The stored value is JSON-decoded when possible; otherwise it is
+        returned unchanged.
+        """
+        original_cached_response = self.disk_cache.get(key)
+        # Compare against None explicitly: falsy payloads (0, "", [], False)
+        # are legitimate cached values and must not be reported as a miss.
+        if original_cached_response is None:
+            return None
+        try:
+            return json.loads(original_cached_response)
+        except Exception:
+            # not JSON (or not a str/bytes payload) - return the raw value
+            return original_cached_response
+
+    def batch_get_cache(self, keys: list, **kwargs):
+        """Return the cached values for `keys`, in order (None for misses)."""
+        return [self.get_cache(key=k, **kwargs) for k in keys]
+
+    def increment_cache(self, key, value: int, **kwargs) -> int:
+        """Add `value` to the number stored at `key` (0 when absent) and store the sum."""
+        # get the value
+        init_value = self.get_cache(key=key) or 0
+        value = init_value + value
+        self.set_cache(key, value, **kwargs)
+        return value
+
+    async def async_get_cache(self, key, **kwargs):
+        """Async wrapper around `get_cache`."""
+        return self.get_cache(key=key, **kwargs)
+
+    async def async_batch_get_cache(self, keys: list, **kwargs):
+        """Async counterpart of `batch_get_cache`."""
+        return [self.get_cache(key=k, **kwargs) for k in keys]
+
+    async def async_increment(self, key, value: int, **kwargs) -> int:
+        """Async counterpart of `increment_cache`."""
+        # get the value
+        init_value = await self.async_get_cache(key=key) or 0
+        value = init_value + value
+        await self.async_set_cache(key, value, **kwargs)
+        return value
+
+    def flush_cache(self):
+        """Remove every entry from the disk cache."""
+        self.disk_cache.clear()
+
+    async def disconnect(self):
+        """No-op; present for interface parity with other cache backends."""
+        pass
+
+    def delete_cache(self, key):
+        """Remove `key` from the disk cache."""
+        self.disk_cache.pop(key)
+
+
 def enable_cache(
-    type: Optional[Literal["local", "redis", "s3"]] = "local",
+    type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
@ -1605,7 +2018,7 @@ def enable_cache(
    Enable cache with the specified configuration.

    Args:
-        type (Optional[Literal["local", "redis"]]): The type of cache to enable. Defaults to "local".
+        type (Optional[Literal["local", "redis", "s3", "disk"]]): The type of cache to enable. Defaults to "local".
        host (Optional[str]): The host address of the cache server. Defaults to None.
        port (Optional[str]): The port number of the cache server. Defaults to None.
        password (Optional[str]): The password for the cache server. Defaults to None.
@ -1641,7 +2054,7 @@ def enable_cache(


 def update_cache(
-    type: Optional[Literal["local", "redis"]] = "local",
+    type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
@ -1670,7 +2083,7 @@ def update_cache(
    Update the cache for LiteLLM.

    Args:
-        type (Optional[Literal["local", "redis"]]): The type of cache. Defaults to "local".
+        type (Optional[Literal["local", "redis", "s3", "disk"]]): The type of cache. Defaults to "local".
        host (Optional[str]): The host of the cache. Defaults to None.
        port (Optional[str]): The port of the cache. Defaults to None.
        password (Optional[str]): The password for the cache. Defaults to None.
--- a/litellm/exceptions.py
+++ b/litellm/exceptions.py
@ -9,55 +9,64 @@

 ## LiteLLM versions of the OpenAI Exception Types

-from openai import (
-    AuthenticationError,
-    BadRequestError,
-    NotFoundError,
-    RateLimitError,
-    APIStatusError,
-    OpenAIError,
-    APIError,
-    APITimeoutError,
-    APIConnectionError,
-    APIResponseValidationError,
-    UnprocessableEntityError,
-    PermissionDeniedError,
-)
+import openai
 import httpx
 from typing import Optional


-class AuthenticationError(AuthenticationError):  # type: ignore
-    def __init__(self, message, llm_provider, model, response: httpx.Response):
+class AuthenticationError(openai.AuthenticationError):  # type: ignore
+    """
+    LiteLLM version of `openai.AuthenticationError`.
+
+    Always carries status_code 401 and records the message, provider, model
+    and optional `litellm_debug_info` on the instance.
+    """
+
+    def __init__(
+        self,
+        message,
+        llm_provider,
+        model,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 401
        self.message = message
        self.llm_provider = llm_provider
        self.model = model
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            self.message, response=response, body=None
        )  # Call the base class constructor with the parameters it needs


 # raise when invalid models passed, example gpt-8
-class NotFoundError(NotFoundError):  # type: ignore
-    def __init__(self, message, model, llm_provider, response: httpx.Response):
+class NotFoundError(openai.NotFoundError):  # type: ignore
+    """
+    LiteLLM version of `openai.NotFoundError`.
+
+    Always carries status_code 404 and records the message, model, provider
+    and optional `litellm_debug_info` on the instance.
+    """
+
+    def __init__(
+        self,
+        message,
+        model,
+        llm_provider,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 404
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            self.message, response=response, body=None
        )  # Call the base class constructor with the parameters it needs


-class BadRequestError(BadRequestError):  # type: ignore
+class BadRequestError(openai.BadRequestError):  # type: ignore
    def __init__(
-        self, message, model, llm_provider, response: Optional[httpx.Response] = None
+        self,
+        message,
+        model,
+        llm_provider,
+        response: Optional[httpx.Response] = None,
+        litellm_debug_info: Optional[str] = None,
    ):
        self.status_code = 400
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
        response = response or httpx.Response(
            status_code=self.status_code,
            request=httpx.Request(
@ -69,46 +78,77 @@ class BadRequestError(BadRequestError):  # type: ignore
        )  # Call the base class constructor with the parameters it needs


-class UnprocessableEntityError(UnprocessableEntityError):  # type: ignore
-    def __init__(self, message, model, llm_provider, response: httpx.Response):
+class UnprocessableEntityError(openai.UnprocessableEntityError):  # type: ignore
+    def __init__(
+        self,
+        message,
+        model,
+        llm_provider,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 422
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            self.message, response=response, body=None
        )  # Call the base class constructor with the parameters it needs


-class Timeout(APITimeoutError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
-        self.status_code = 408
-        self.message = message
-        self.model = model
-        self.llm_provider = llm_provider
+class Timeout(openai.APITimeoutError):  # type: ignore
+    def __init__(
+        self, message, model, llm_provider, litellm_debug_info: Optional[str] = None
+    ):
        request = httpx.Request(method="POST", url="https://api.openai.com/v1")
        super().__init__(
            request=request
        )  # Call the base class constructor with the parameters it needs
+        self.status_code = 408
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
+
+    # custom function to convert to str
+    def __str__(self):
+        return str(self.message)


-class PermissionDeniedError(PermissionDeniedError):  # type:ignore
-    def __init__(self, message, llm_provider, model, response: httpx.Response):
+class PermissionDeniedError(openai.PermissionDeniedError):  # type:ignore
+    def __init__(
+        self,
+        message,
+        llm_provider,
+        model,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 403
        self.message = message
        self.llm_provider = llm_provider
        self.model = model
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            self.message, response=response, body=None
        )  # Call the base class constructor with the parameters it needs


-class RateLimitError(RateLimitError):  # type: ignore
-    def __init__(self, message, llm_provider, model, response: httpx.Response):
+class RateLimitError(openai.RateLimitError):  # type: ignore
+    def __init__(
+        self,
+        message,
+        llm_provider,
+        model,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 429
        self.message = message
        self.llm_provider = llm_provider
-        self.modle = model
+        self.model = model  # was misspelled "modle", which left .model unset on this error
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            self.message, response=response, body=None
        )  # Call the base class constructor with the parameters it needs
@ -116,11 +156,19 @@ class RateLimitError(RateLimitError):  # type: ignore

 # subclass of BadRequestError - meant to give more granularity for error handling of context window exceeded errors
 class ContextWindowExceededError(BadRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider, response: httpx.Response):
+    def __init__(
+        self,
+        message,
+        model,
+        llm_provider,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 400
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            message=self.message,
            model=self.model,  # type: ignore
@ -131,11 +179,19 @@ class ContextWindowExceededError(BadRequestError):  # type: ignore

 class ContentPolicyViolationError(BadRequestError):  # type: ignore
    #  Error code: 400 - {'error': {'code': 'content_policy_violation', 'message': 'Your request was rejected as a result of our safety system. Image descriptions generated from your prompt may contain text that is not allowed by our safety system. If you believe this was done in error, your request may succeed if retried, or by adjusting your prompt.', 'param': None, 'type': 'invalid_request_error'}}
-    def __init__(self, message, model, llm_provider, response: httpx.Response):
+    def __init__(
+        self,
+        message,
+        model,
+        llm_provider,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 400
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            message=self.message,
            model=self.model,  # type: ignore
@ -144,51 +200,77 @@ class ContentPolicyViolationError(BadRequestError):  # type: ignore
        )  # Call the base class constructor with the parameters it needs


-class ServiceUnavailableError(APIStatusError):  # type: ignore
-    def __init__(self, message, llm_provider, model, response: httpx.Response):
+class ServiceUnavailableError(openai.APIStatusError):  # type: ignore
+    def __init__(
+        self,
+        message,
+        llm_provider,
+        model,
+        response: httpx.Response,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.status_code = 503
        self.message = message
        self.llm_provider = llm_provider
        self.model = model
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(
            self.message, response=response, body=None
        )  # Call the base class constructor with the parameters it needs


 # raise this when the API returns an invalid response object - https://github.com/openai/openai-python/blob/1be14ee34a0f8e42d3f9aa5451aa4cb161f1781f/openai/api_requestor.py#L401
-class APIError(APIError):  # type: ignore
+class APIError(openai.APIError):  # type: ignore
    def __init__(
-        self, status_code, message, llm_provider, model, request: httpx.Request
+        self,
+        status_code,
+        message,
+        llm_provider,
+        model,
+        request: httpx.Request,
+        litellm_debug_info: Optional[str] = None,
    ):
        self.status_code = status_code
        self.message = message
        self.llm_provider = llm_provider
        self.model = model
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(self.message, request=request, body=None)  # type: ignore


 # raised if an invalid request (not get, delete, put, post) is made
-class APIConnectionError(APIConnectionError):  # type: ignore
-    def __init__(self, message, llm_provider, model, request: httpx.Request):
+class APIConnectionError(openai.APIConnectionError):  # type: ignore
+    def __init__(
+        self,
+        message,
+        llm_provider,
+        model,
+        request: httpx.Request,
+        litellm_debug_info: Optional[str] = None,
+    ):
        self.message = message
        self.llm_provider = llm_provider
        self.model = model
        self.status_code = 500
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(message=self.message, request=request)


 # raised when an API response fails validation; reported with a synthetic 500 response
-class APIResponseValidationError(APIResponseValidationError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+class APIResponseValidationError(openai.APIResponseValidationError):  # type: ignore
+    def __init__(
+        self, message, llm_provider, model, litellm_debug_info: Optional[str] = None
+    ):
        self.message = message
        self.llm_provider = llm_provider
        self.model = model
        request = httpx.Request(method="POST", url="https://api.openai.com/v1")
        response = httpx.Response(status_code=500, request=request)
+        self.litellm_debug_info = litellm_debug_info
        super().__init__(response=response, body=None, message=message)


-class OpenAIError(OpenAIError):  # type: ignore
+class OpenAIError(openai.OpenAIError):  # type: ignore
    def __init__(self, original_exception):
        self.status_code = original_exception.http_status
        super().__init__(
@ -210,7 +292,7 @@ class BudgetExceededError(Exception):


 ## DEPRECATED ##
-class InvalidRequestError(BadRequestError):  # type: ignore
+class InvalidRequestError(openai.BadRequestError):  # type: ignore
    def __init__(self, message, model, llm_provider):
        self.status_code = 400
        self.message = message
--- a/litellm/integrations/aispend.py
+++ b/litellm/integrations/aispend.py
@ -1,9 +1,6 @@
 #### What this does ####
 #    On success + failure, log events to aispend.io
 import dotenv, os
-import requests
-
-dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
 import datetime

--- a/Show more
+++ b/Show more