Merge branch 'main' into patch-1

Author: Ayub Kokabi
Date: 2025-02-24 16:37:25 +03:30 (committed by GitHub)
Commit: 095206cf0e
1230 changed files with 107036 additions and 36584 deletions

File diff suppressed because it is too large


@@ -9,3 +9,5 @@ anthropic
orjson==3.9.15
pydantic==2.7.1
google-cloud-aiplatform==1.43.0
fastapi-sso==0.10.0
uvloop==0.21.0


@@ -9,3 +9,4 @@ tests
.devcontainer
*.tgz
log.txt
docker/Dockerfile.*


@@ -20,3 +20,8 @@ REPLICATE_API_TOKEN = ""
ANTHROPIC_API_KEY = ""
# Infisical
INFISICAL_TOKEN = ""
# Development Configs
LITELLM_MASTER_KEY = "sk-1234"
DATABASE_URL = "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB = "True"
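For orientation, a proxy started with the development values above listens on port 4000 and accepts the master key as a bearer token; a minimal smoke-test sketch might look like the following (the OpenAI SDK, base URL, and model name are illustrative assumptions, not part of this commit):

```python
# Hypothetical smoke test against a local LiteLLM proxy using the dev values above.
from openai import OpenAI

client = OpenAI(
    base_url="http://0.0.0.0:4000",  # assumed local proxy address
    api_key="sk-1234",               # LITELLM_MASTER_KEY from the dev .env above
)
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",  # illustrative model; must be configured on the proxy
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```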


@@ -22,7 +22,7 @@
<!-- List of changes -->
## [REQUIRED] Testing - Attach a screenshot of any new tests passing locall
## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
If UI changes, send a screenshot/GIF of working UI fixes
<!-- Test procedure -->


@@ -52,6 +52,39 @@ def interpret_results(csv_file):
return markdown_table
def _get_docker_run_command_stable_release(release_version):
return f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm_stable_release_branch-{release_version}
"""
def _get_docker_run_command(release_version):
return f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm:main-{release_version}
"""
def get_docker_run_command(release_version):
if "stable" in release_version:
return _get_docker_run_command_stable_release(release_version)
else:
return _get_docker_run_command(release_version)
if __name__ == "__main__":
csv_file = "load_test_stats.csv" # Change this to the path of your CSV file
markdown_table = interpret_results(csv_file)
@@ -79,17 +112,7 @@ if __name__ == "__main__":
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
existing_release_body = latest_release.body[:start_index]
docker_run_command = f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm:main-{release_version}
```
"""
docker_run_command = get_docker_run_command(release_version)
print("docker run command: ", docker_run_command)
new_release_body = (
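As a quick illustration of the dispatch added above: `get_docker_run_command` returns the stable-release image when the tag contains "stable" and the main image otherwise. A usage sketch, assuming the functions defined in this script and purely hypothetical release tags:

```python
# Hypothetical release tags, only to illustrate get_docker_run_command() above
print(get_docker_run_command("v1.61.3-stable"))  # uses ghcr.io/berriai/litellm_stable_release_branch-v1.61.3-stable
print(get_docker_run_command("v1.61.3"))         # uses ghcr.io/berriai/litellm:main-v1.61.3
```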


@@ -1,6 +1,4 @@
from locust import HttpUser, task, between, events
import json
import time
from locust import HttpUser, task, between
class MyUser(HttpUser):
@@ -10,7 +8,7 @@ class MyUser(HttpUser):
def chat_completion(self):
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer sk-ZoHqrLIs2-5PzJrqBaviAA",
"Authorization": "Bearer sk-ZoHqrLIs2-5PzJrqBaviAA",
# Include any additional headers you may need for authentication, etc.
}
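For context, the hunk above comes from a Locust load-test script; a minimal, self-contained task along the same lines might look like the sketch below (the endpoint path, payload, pacing, and placeholder key are illustrative assumptions, not part of this commit):

```python
from locust import HttpUser, task, between


class ChatCompletionUser(HttpUser):
    # Assumed pacing between requests; the real script defines its own wait_time.
    wait_time = between(0.5, 1.5)

    @task
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer sk-your-proxy-key",  # placeholder virtual key
        }
        payload = {
            "model": "gpt-3.5-turbo",  # illustrative model name
            "messages": [{"role": "user", "content": "Hello, how are you?"}],
        }
        # Locust posts relative to the --host flag, e.g. `locust --host http://0.0.0.0:4000`
        self.client.post("/chat/completions", json=payload, headers=headers)
```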

.github/workflows/reset_stable.yml

@@ -0,0 +1,39 @@
name: Reset litellm_stable branch
on:
release:
types: [published, created]
jobs:
update-stable-branch:
if: ${{ startsWith(github.event.release.tag_name, 'v') && !endsWith(github.event.release.tag_name, '-stable') }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Reset litellm_stable_release_branch branch to the release commit
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Configure Git user
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
# Fetch all branches and tags
git fetch --all
# Check if the litellm_stable_release_branch branch exists
if git show-ref --verify --quiet refs/remotes/origin/litellm_stable_release_branch; then
echo "litellm_stable_release_branch branch exists."
git checkout litellm_stable_release_branch
else
echo "litellm_stable_release_branch branch does not exist. Creating it."
git checkout -b litellm_stable_release_branch
fi
# Reset litellm_stable_release_branch branch to the release commit
git reset --hard $GITHUB_SHA
# Push the updated litellm_stable_release_branch branch
git push origin litellm_stable_release_branch --force

.github/workflows/stale.yml

@@ -0,0 +1,20 @@
name: "Stale Issue Management"
on:
schedule:
- cron: '0 0 * * *' # Runs daily at midnight UTC
workflow_dispatch:
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v8
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
stale-issue-message: "This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs."
stale-pr-message: "This pull request has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs."
days-before-stale: 90 # Revert to 60 days
days-before-close: 7 # Revert to 7 days
stale-issue-label: "stale"
operations-per-run: 1000

.gitignore

@@ -48,7 +48,7 @@ deploy/charts/litellm/charts/*
deploy/charts/*.tgz
litellm/proxy/vertex_key.json
**/.vim/
/node_modules
**/node_modules
kub.yaml
loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
@@ -66,3 +66,14 @@ litellm/tests/langfuse.log
litellm/tests/langfuse.log
litellm/proxy/google-cloud-sdk/*
tests/llm_translation/log.txt
venv/
tests/local_testing/log.txt
.codegpt
litellm/proxy/_new_new_secret_config.yaml
litellm/proxy/custom_guardrail.py
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html
.mypy_cache/*
litellm/proxy/application.log


@@ -22,7 +22,7 @@ repos:
rev: 7.0.0 # The version of flake8 to use
hooks:
- id: flake8
exclude: ^litellm/tests/|^litellm/proxy/tests/
exclude: ^litellm/tests/|^litellm/proxy/tests/|^litellm/tests/litellm/|^tests/litellm/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
# - id: flake8


@@ -1,18 +1,20 @@
# Base image for building
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/python:latest-dev
# Runtime image
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/python:latest-dev
# Builder stage
FROM $LITELLM_BUILD_IMAGE AS builder
# Set the working directory to /app
WORKDIR /app
USER root
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN apk update && \
apk add --no-cache gcc python3-dev openssl openssl-dev
RUN pip install --upgrade pip && \
pip install build
@@ -49,8 +51,12 @@ RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh
# Runtime stage
FROM $LITELLM_RUNTIME_IMAGE AS runtime
# Update dependencies and clean up - handles debian security issue
RUN apt-get update && apt-get upgrade -y && rm -rf /var/lib/apt/lists/*
# Ensure runtime stage runs as root
USER root
# Install runtime dependencies
RUN apk update && \
apk add --no-cache openssl
WORKDIR /app
# Copy the current directory contents into the container at /app
@@ -67,10 +73,11 @@ RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl
# Generate prisma client
RUN prisma generate
RUN chmod +x docker/entrypoint.sh
RUN chmod +x docker/prod_entrypoint.sh
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
ENTRYPOINT ["docker/prod_entrypoint.sh"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
CMD ["--port", "4000"]

README.md

@@ -64,18 +64,54 @@ import os
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["COHERE_API_KEY"] = "your-cohere-key"
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion(model="gpt-3.5-turbo", messages=messages)
response = completion(model="openai/gpt-4o", messages=messages)
# cohere call
response = completion(model="command-nightly", messages=messages)
# anthropic call
response = completion(model="anthropic/claude-3-sonnet-20240229", messages=messages)
print(response)
```
### Response (OpenAI Format)
```json
{
"id": "chatcmpl-565d891b-a42e-4c39-8d14-82a1f5208885",
"created": 1734366691,
"model": "claude-3-sonnet-20240229",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hello! As an AI language model, I don't have feelings, but I'm operating properly and ready to assist you with any questions or tasks you may have. How can I help you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 43,
"prompt_tokens": 13,
"total_tokens": 56,
"completion_tokens_details": null,
"prompt_tokens_details": {
"audio_tokens": null,
"cached_tokens": 0
},
"cache_creation_input_tokens": 0,
"cache_read_input_tokens": 0
}
}
```
Call any model supported by a provider, with `model=<provider_name>/<model_name>`. There might be provider-specific details here, so refer to [provider docs for more information](https://docs.litellm.ai/docs/providers)
## Async ([Docs](https://docs.litellm.ai/docs/completion/stream#async-completion))
@@ -87,7 +123,7 @@ import asyncio
async def test_get_response():
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
response = await acompletion(model="gpt-3.5-turbo", messages=messages)
response = await acompletion(model="openai/gpt-4o", messages=messages)
return response
response = asyncio.run(test_get_response())
@@ -101,37 +137,63 @@ Streaming is supported for all models (Bedrock, Huggingface, TogetherAI, Azure,
```python
from litellm import completion
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
response = completion(model="openai/gpt-4o", messages=messages, stream=True)
for part in response:
print(part.choices[0].delta.content or "")
# claude 2
response = completion('claude-2', messages, stream=True)
response = completion('anthropic/claude-3-sonnet-20240229', messages, stream=True)
for part in response:
print(part.choices[0].delta.content or "")
print(part)
```
### Response chunk (OpenAI Format)
```json
{
"id": "chatcmpl-2be06597-eb60-4c70-9ec5-8cd2ab1b4697",
"created": 1734366925,
"model": "claude-3-sonnet-20240229",
"object": "chat.completion.chunk",
"system_fingerprint": null,
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "Hello",
"role": "assistant",
"function_call": null,
"tool_calls": null,
"audio": null
},
"logprobs": null
}
]
}
```
## Logging Observability ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack, MLflow
LiteLLM exposes pre defined callbacks to send data to Lunary, MLflow, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack
```python
from litellm import completion
## set env variables for logging tools
## set env variables for logging tools (when using MLflow, no API key set up is required)
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
os.environ["HELICONE_API_KEY"] = "your-helicone-auth-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
os.environ["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = "your-openai-key"
# set callbacks
litellm.success_callback = ["lunary", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc
litellm.success_callback = ["lunary", "mlflow", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
response = completion(model="openai/gpt-4o", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
# LiteLLM Proxy Server (LLM Gateway) - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
@@ -200,7 +262,7 @@ echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
# Add the litellm salt key - you cannot change this after adding a model
# It is used to encrypt / decrypt your LLM API Key credentials
# We recommned - https://1password.com/password-generator/
# We recommend - https://1password.com/password-generator/
# password generator to get a random hash for litellm salt key
echo 'LITELLM_SALT_KEY="sk-1234"' > .env
@@ -241,6 +303,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
|-------------------------------------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------------|
| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [AI/ML API](https://docs.litellm.ai/docs/providers/aiml) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | ✅ | |
| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ | |
| [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
@@ -280,25 +343,32 @@ curl 'http://0.0.0.0:4000/key/generate' \
To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
Here's how to modify the repo locally:
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Navigate into the project, and install dependencies:
Step 2: Install dependencies:
```
cd litellm
poetry install -E extra_proxy -E proxy
pip install -r requirements.txt
```
Step 3: Test your change:
a. Add a pytest test within `tests/litellm/`
This folder follows the same directory structure as `litellm/`.
If a corresponding test file does not exist, create one.
b. Run the test
```
cd litellm/tests # pwd: Documents/litellm/litellm/tests
poetry run flake8
poetry run pytest .
cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
pytest /path/to/test_file.py
```
Step 4: Submit a PR with your changes! 🚀
@@ -388,3 +458,20 @@ If you have suggestions on how to improve the code quality feel free to open an
<a href="https://github.com/BerriAI/litellm/graphs/contributors">
<img src="https://contrib.rocks/image?repo=BerriAI/litellm" />
</a>
## Run in Developer mode
### Services
1. Setup .env file in root
2. Run dependant services `docker-compose up db prometheus`
### Backend
1. (In root) create virtual environment `python -m venv .venv`
2. Activate virtual environment `source .venv/bin/activate`
3. Install dependencies `pip install -e ".[all]"`
4. Start proxy backend `uvicorn litellm.proxy.proxy_server:app --host localhost --port 4000 --reload`
### Frontend
1. Navigate to `ui/litellm-dashboard`
2. Install dependencies `npm install`
3. Run `npm run dev` to start the dashboard

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,423 +1,422 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "BmX0b5Ueh91v"
},
"source": [
"# LiteLLM - Azure OpenAI + OpenAI Calls\n",
"This notebook covers the following for Azure OpenAI + OpenAI:\n",
"* Completion - Quick start\n",
"* Completion - Streaming\n",
"* Completion - Azure, OpenAI in separate threads\n",
"* Completion - Stress Test 10 requests in parallel\n",
"* Completion - Azure, OpenAI in the same thread"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# LiteLLM - Azure OpenAI + OpenAI Calls\n",
"This notebook covers the following for Azure OpenAI + OpenAI:\n",
"* Completion - Quick start\n",
"* Completion - Streaming\n",
"* Completion - Azure, OpenAI in separate threads\n",
"* Completion - Stress Test 10 requests in parallel\n",
"* Completion - Azure, OpenAI in the same thread"
],
"metadata": {
"id": "BmX0b5Ueh91v"
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iHq4d0dpfawS"
},
"outputs": [],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "mnveHO5dfcB0"
},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "eo88QUdbiDIE"
},
"source": [
"## Completion - Quick start"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5OSosWNCfc_2",
"outputId": "c52344b1-2458-4695-a7eb-a9b076893348"
},
"outputs": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iHq4d0dpfawS"
},
"outputs": [],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "code",
"source": [
"import os, litellm"
],
"metadata": {
"id": "mnveHO5dfcB0"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Completion - Quick start"
],
"metadata": {
"id": "eo88QUdbiDIE"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"from litellm import completion\n",
"\n",
"# openai configs\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# azure openai configs\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"\n",
"# openai call\n",
"response = completion(\n",
" model = \"gpt-3.5-turbo\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
")\n",
"print(\"Openai Response\\n\")\n",
"print(response)\n",
"\n",
"\n",
"\n",
"# azure call\n",
"response = completion(\n",
" model = \"azure/your-azure-deployment\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
")\n",
"print(\"Azure Response\\n\")\n",
"print(response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5OSosWNCfc_2",
"outputId": "c52344b1-2458-4695-a7eb-a9b076893348"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Openai Response\n",
"\n",
"{\n",
" \"id\": \"chatcmpl-7yjVOEKCPw2KdkfIaM3Ao1tIXp8EM\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694708958,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"I'm an AI, so I don't have feelings, but I'm here to help you. How can I assist you?\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 26,\n",
" \"total_tokens\": 39\n",
" }\n",
"}\n",
"Azure Response\n",
"\n",
"{\n",
" \"id\": \"chatcmpl-7yjVQ6m2R2HRtnKHRRFp6JzL4Fjez\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694708960,\n",
" \"model\": \"gpt-35-turbo\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"finish_reason\": \"stop\",\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"Hello there! As an AI language model, I don't have feelings but I'm functioning well. How can I assist you today?\"\n",
" }\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"completion_tokens\": 27,\n",
" \"prompt_tokens\": 14,\n",
" \"total_tokens\": 41\n",
" }\n",
"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Completion - Streaming"
],
"metadata": {
"id": "dQMkM-diiKdE"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"from litellm import completion\n",
"\n",
"# openai configs\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# azure openai configs\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"\n",
"# openai call\n",
"response = completion(\n",
" model = \"gpt-3.5-turbo\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
" stream=True\n",
")\n",
"print(\"OpenAI Streaming response\")\n",
"for chunk in response:\n",
" print(chunk)\n",
"\n",
"# azure call\n",
"response = completion(\n",
" model = \"azure/your-azure-deployment\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
" stream=True\n",
")\n",
"print(\"Azure Streaming response\")\n",
"for chunk in response:\n",
" print(chunk)\n"
],
"metadata": {
"id": "uVvJDVn4g1i1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Completion - Azure, OpenAI in separate threads"
],
"metadata": {
"id": "4xrOPnt-oqwm"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"import threading\n",
"from litellm import completion\n",
"\n",
"# Function to make a completion call\n",
"def make_completion(model, messages):\n",
" response = completion(\n",
" model=model,\n",
" messages=messages\n",
" )\n",
"\n",
" print(f\"Response for {model}: {response}\")\n",
"\n",
"# openai configs\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# azure openai configs\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"# Define the messages for the completions\n",
"messages = [{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
"\n",
"# Create threads for making the completions\n",
"thread1 = threading.Thread(target=make_completion, args=(\"gpt-3.5-turbo\", messages))\n",
"thread2 = threading.Thread(target=make_completion, args=(\"azure/your-azure-deployment\", messages))\n",
"\n",
"# Start both threads\n",
"thread1.start()\n",
"thread2.start()\n",
"\n",
"# Wait for both threads to finish\n",
"thread1.join()\n",
"thread2.join()\n",
"\n",
"print(\"Both completions are done.\")"
],
"metadata": {
"id": "V5b5taJPjvC3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Completion - Stress Test 10 requests in parallel\n",
"\n"
],
"metadata": {
"id": "lx8DbMBqoAoN"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"import threading\n",
"from litellm import completion\n",
"\n",
"# Function to make a completion call\n",
"def make_completion(model, messages):\n",
" response = completion(\n",
" model=model,\n",
" messages=messages\n",
" )\n",
"\n",
" print(f\"Response for {model}: {response}\")\n",
"\n",
"# Set your API keys\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"# Define the messages for the completions\n",
"messages = [{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
"\n",
"# Create and start 10 threads for making completions\n",
"threads = []\n",
"for i in range(10):\n",
" thread = threading.Thread(target=make_completion, args=(\"gpt-3.5-turbo\" if i % 2 == 0 else \"azure/your-azure-deployment\", messages))\n",
" threads.append(thread)\n",
" thread.start()\n",
"\n",
"# Wait for all threads to finish\n",
"for thread in threads:\n",
" thread.join()\n",
"\n",
"print(\"All completions are done.\")\n"
],
"metadata": {
"id": "pHYANOlOkoDh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Completion - Azure, OpenAI in the same thread"
],
"metadata": {
"id": "yB2NDOO4oxrp"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"from litellm import completion\n",
"\n",
"# Function to make both OpenAI and Azure completions\n",
"def make_completions():\n",
" # Set your OpenAI API key\n",
" os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
" # OpenAI completion\n",
" openai_response = completion(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
" )\n",
"\n",
" print(\"OpenAI Response:\", openai_response)\n",
"\n",
" # Set your Azure OpenAI API key and configuration\n",
" os.environ[\"AZURE_API_KEY\"] = \"\"\n",
" os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
" os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
" # Azure OpenAI completion\n",
" azure_response = completion(\n",
" model=\"azure/your-azure-deployment\",\n",
" messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
" )\n",
"\n",
" print(\"Azure OpenAI Response:\", azure_response)\n",
"\n",
"# Call the function to make both completions in one thread\n",
"make_completions()\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HTBqwzxpnxab",
"outputId": "f3bc0efe-e4d5-44d5-a193-97d178cfbe14"
},
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"OpenAI Response: {\n",
" \"id\": \"chatcmpl-7yjzrDeOeVeSrQ00tApmTxEww3vBS\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694710847,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"Hello! I'm an AI, so I don't have feelings, but I'm here to help you. How can I assist you today?\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 29,\n",
" \"total_tokens\": 42\n",
" }\n",
"}\n",
"Azure OpenAI Response: {\n",
" \"id\": \"chatcmpl-7yjztAQ0gK6IMQt7cvLroMSOoXkeu\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694710849,\n",
" \"model\": \"gpt-35-turbo\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"finish_reason\": \"stop\",\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"As an AI language model, I don't have feelings but I'm functioning properly. Thank you for asking! How can I assist you today?\"\n",
" }\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"completion_tokens\": 29,\n",
" \"prompt_tokens\": 14,\n",
" \"total_tokens\": 43\n",
" }\n",
"}\n"
]
}
]
"name": "stdout",
"output_type": "stream",
"text": [
"Openai Response\n",
"\n",
"{\n",
" \"id\": \"chatcmpl-7yjVOEKCPw2KdkfIaM3Ao1tIXp8EM\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694708958,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"I'm an AI, so I don't have feelings, but I'm here to help you. How can I assist you?\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 26,\n",
" \"total_tokens\": 39\n",
" }\n",
"}\n",
"Azure Response\n",
"\n",
"{\n",
" \"id\": \"chatcmpl-7yjVQ6m2R2HRtnKHRRFp6JzL4Fjez\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694708960,\n",
" \"model\": \"gpt-35-turbo\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"finish_reason\": \"stop\",\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"Hello there! As an AI language model, I don't have feelings but I'm functioning well. How can I assist you today?\"\n",
" }\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"completion_tokens\": 27,\n",
" \"prompt_tokens\": 14,\n",
" \"total_tokens\": 41\n",
" }\n",
"}\n"
]
}
]
],
"source": [
"from litellm import completion\n",
"\n",
"# openai configs\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# azure openai configs\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"\n",
"# openai call\n",
"response = completion(\n",
" model = \"gpt-3.5-turbo\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
")\n",
"print(\"Openai Response\\n\")\n",
"print(response)\n",
"\n",
"\n",
"\n",
"# azure call\n",
"response = completion(\n",
" model = \"azure/your-azure-deployment\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
")\n",
"print(\"Azure Response\\n\")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dQMkM-diiKdE"
},
"source": [
"## Completion - Streaming"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uVvJDVn4g1i1"
},
"outputs": [],
"source": [
"import os\n",
"from litellm import completion\n",
"\n",
"# openai configs\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# azure openai configs\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"\n",
"# openai call\n",
"response = completion(\n",
" model = \"gpt-3.5-turbo\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
" stream=True\n",
")\n",
"print(\"OpenAI Streaming response\")\n",
"for chunk in response:\n",
" print(chunk)\n",
"\n",
"# azure call\n",
"response = completion(\n",
" model = \"azure/your-azure-deployment\",\n",
" messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
" stream=True\n",
")\n",
"print(\"Azure Streaming response\")\n",
"for chunk in response:\n",
" print(chunk)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4xrOPnt-oqwm"
},
"source": [
"## Completion - Azure, OpenAI in separate threads"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "V5b5taJPjvC3"
},
"outputs": [],
"source": [
"import os\n",
"import threading\n",
"from litellm import completion\n",
"\n",
"# Function to make a completion call\n",
"def make_completion(model, messages):\n",
" response = completion(\n",
" model=model,\n",
" messages=messages\n",
" )\n",
"\n",
" print(f\"Response for {model}: {response}\")\n",
"\n",
"# openai configs\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
"# azure openai configs\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"# Define the messages for the completions\n",
"messages = [{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
"\n",
"# Create threads for making the completions\n",
"thread1 = threading.Thread(target=make_completion, args=(\"gpt-3.5-turbo\", messages))\n",
"thread2 = threading.Thread(target=make_completion, args=(\"azure/your-azure-deployment\", messages))\n",
"\n",
"# Start both threads\n",
"thread1.start()\n",
"thread2.start()\n",
"\n",
"# Wait for both threads to finish\n",
"thread1.join()\n",
"thread2.join()\n",
"\n",
"print(\"Both completions are done.\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lx8DbMBqoAoN"
},
"source": [
"## Completion - Stress Test 10 requests in parallel\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pHYANOlOkoDh"
},
"outputs": [],
"source": [
"import os\n",
"import threading\n",
"from litellm import completion\n",
"\n",
"# Function to make a completion call\n",
"def make_completion(model, messages):\n",
" response = completion(\n",
" model=model,\n",
" messages=messages\n",
" )\n",
"\n",
" print(f\"Response for {model}: {response}\")\n",
"\n",
"# Set your API keys\n",
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_KEY\"] = \"\"\n",
"os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
"os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
"# Define the messages for the completions\n",
"messages = [{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
"\n",
"# Create and start 10 threads for making completions\n",
"threads = []\n",
"for i in range(10):\n",
" thread = threading.Thread(target=make_completion, args=(\"gpt-3.5-turbo\" if i % 2 == 0 else \"azure/your-azure-deployment\", messages))\n",
" threads.append(thread)\n",
" thread.start()\n",
"\n",
"# Wait for all threads to finish\n",
"for thread in threads:\n",
" thread.join()\n",
"\n",
"print(\"All completions are done.\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yB2NDOO4oxrp"
},
"source": [
"## Completion - Azure, OpenAI in the same thread"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HTBqwzxpnxab",
"outputId": "f3bc0efe-e4d5-44d5-a193-97d178cfbe14"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI Response: {\n",
" \"id\": \"chatcmpl-7yjzrDeOeVeSrQ00tApmTxEww3vBS\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694710847,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"Hello! I'm an AI, so I don't have feelings, but I'm here to help you. How can I assist you today?\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 29,\n",
" \"total_tokens\": 42\n",
" }\n",
"}\n",
"Azure OpenAI Response: {\n",
" \"id\": \"chatcmpl-7yjztAQ0gK6IMQt7cvLroMSOoXkeu\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1694710849,\n",
" \"model\": \"gpt-35-turbo\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"finish_reason\": \"stop\",\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"As an AI language model, I don't have feelings but I'm functioning properly. Thank you for asking! How can I assist you today?\"\n",
" }\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"completion_tokens\": 29,\n",
" \"prompt_tokens\": 14,\n",
" \"total_tokens\": 43\n",
" }\n",
"}\n"
]
}
],
"source": [
"import os\n",
"from litellm import completion\n",
"\n",
"# Function to make both OpenAI and Azure completions\n",
"def make_completions():\n",
" # Set your OpenAI API key\n",
" os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
"\n",
" # OpenAI completion\n",
" openai_response = completion(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
" )\n",
"\n",
" print(\"OpenAI Response:\", openai_response)\n",
"\n",
" # Set your Azure OpenAI API key and configuration\n",
" os.environ[\"AZURE_API_KEY\"] = \"\"\n",
" os.environ[\"AZURE_API_BASE\"] = \"https://openai-gpt-4-test-v-1.openai.azure.com/\"\n",
" os.environ[\"AZURE_API_VERSION\"] = \"2023-05-15\"\n",
"\n",
" # Azure OpenAI completion\n",
" azure_response = completion(\n",
" model=\"azure/your-azure-deployment\",\n",
" messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}]\n",
" )\n",
"\n",
" print(\"Azure OpenAI Response:\", azure_response)\n",
"\n",
"# Call the function to make both completions in one thread\n",
"make_completions()\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because one or more lines are too long


@@ -1,166 +1,163 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "MbLbs1tbISk-"
},
"source": [
"# LiteLLM Batch Completions Example\n",
"\n",
"* This tutorial walks through using `batch_completion`\n",
"* Docs: https://docs.litellm.ai/docs/completion/batching"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# LiteLLM Batch Completions Example\n",
"\n",
"* This tutorial walks through using `batch_completion`\n",
"* Docs: https://docs.litellm.ai/docs/completion/batching"
],
"metadata": {
"id": "MbLbs1tbISk-"
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Ty6-ko_aDlPF"
},
"outputs": [],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KGhNJRUCIh1j"
},
"source": [
"## Import Batch Completion"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "LOtI43snDrSK"
},
"outputs": [],
"source": [
"import os\n",
"from litellm import batch_completion\n",
"\n",
"# set your API_KEY\n",
"os.environ['ANTHROPIC_API_KEY'] = \"\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Xhv92NBaIpaw"
},
"source": [
"## Calling `litellm.batch_completion`\n",
"\n",
"In the batch_completion method, you provide a list of messages where each sub-list of messages is passed to litellm.completion(), allowing you to process multiple prompts efficiently in a single API call."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yY7GIRLsDywu",
"outputId": "009ea67f-95d5-462b-947f-b0d21e60c5bb"
},
"outputs": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Ty6-ko_aDlPF"
},
"outputs": [],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "markdown",
"source": [
"## Import Batch Completion"
],
"metadata": {
"id": "KGhNJRUCIh1j"
}
},
{
"cell_type": "code",
"source": [
"import litellm\n",
"import os\n",
"from litellm import batch_completion\n",
"\n",
"# set your API_KEY\n",
"os.environ['ANTHROPIC_API_KEY'] = \"\""
],
"metadata": {
"id": "LOtI43snDrSK"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Calling `litellm.batch_completion`\n",
"\n",
"In the batch_completion method, you provide a list of messages where each sub-list of messages is passed to litellm.completion(), allowing you to process multiple prompts efficiently in a single API call."
],
"metadata": {
"id": "Xhv92NBaIpaw"
}
},
{
"cell_type": "code",
"source": [
"import litellm\n",
"import os\n",
"from litellm import batch_completion\n",
"\n",
"os.environ['ANTHROPIC_API_KEY'] = \"\"\n",
"\n",
"\n",
"responses = batch_completion(\n",
" model=\"claude-2\",\n",
" messages = [\n",
" [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"good morning? \"\n",
" }\n",
" ],\n",
" [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what's the time? \"\n",
" }\n",
" ]\n",
" ]\n",
")\n",
"responses"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yY7GIRLsDywu",
"outputId": "009ea67f-95d5-462b-947f-b0d21e60c5bb"
},
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[<ModelResponse at 0x7a164eed4450> JSON: {\n",
" \"choices\": [\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"content\": \" Good morning!\",\n",
" \"role\": \"assistant\",\n",
" \"logprobs\": null\n",
" }\n",
" }\n",
" ],\n",
" \"created\": 1694030351.309254,\n",
" \"model\": \"claude-2\",\n",
" \"usage\": {\n",
" \"prompt_tokens\": 11,\n",
" \"completion_tokens\": 3,\n",
" \"total_tokens\": 14\n",
" }\n",
" },\n",
" <ModelResponse at 0x7a164eed5800> JSON: {\n",
" \"choices\": [\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"content\": \" I'm an AI assistant created by Anthropic. I don't actually have a concept of the current time.\",\n",
" \"role\": \"assistant\",\n",
" \"logprobs\": null\n",
" }\n",
" }\n",
" ],\n",
" \"created\": 1694030352.1215081,\n",
" \"model\": \"claude-2\",\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 22,\n",
" \"total_tokens\": 35\n",
" }\n",
" }]"
]
},
"metadata": {},
"execution_count": 11
}
"data": {
"text/plain": [
"[<ModelResponse at 0x7a164eed4450> JSON: {\n",
" \"choices\": [\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"content\": \" Good morning!\",\n",
" \"role\": \"assistant\",\n",
" \"logprobs\": null\n",
" }\n",
" }\n",
" ],\n",
" \"created\": 1694030351.309254,\n",
" \"model\": \"claude-2\",\n",
" \"usage\": {\n",
" \"prompt_tokens\": 11,\n",
" \"completion_tokens\": 3,\n",
" \"total_tokens\": 14\n",
" }\n",
" },\n",
" <ModelResponse at 0x7a164eed5800> JSON: {\n",
" \"choices\": [\n",
" {\n",
" \"finish_reason\": \"stop\",\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"content\": \" I'm an AI assistant created by Anthropic. I don't actually have a concept of the current time.\",\n",
" \"role\": \"assistant\",\n",
" \"logprobs\": null\n",
" }\n",
" }\n",
" ],\n",
" \"created\": 1694030352.1215081,\n",
" \"model\": \"claude-2\",\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 22,\n",
" \"total_tokens\": 35\n",
" }\n",
" }]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
]
],
"source": [
"import os\n",
"\n",
"os.environ['ANTHROPIC_API_KEY'] = \"\"\n",
"\n",
"\n",
"responses = batch_completion(\n",
" model=\"claude-2\",\n",
" messages = [\n",
" [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"good morning? \"\n",
" }\n",
" ],\n",
" [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what's the time? \"\n",
" }\n",
" ]\n",
" ]\n",
")\n",
"responses"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}


@@ -1,204 +1,205 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "680oRk1af-xJ"
},
"source": [
"# Environment Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X7TgJFn8f88p"
},
"outputs": [],
"source": [
"import csv\n",
"from typing import Optional\n",
"import httpx, json\n",
"import asyncio\n",
"\n",
"proxy_base_url = \"http://0.0.0.0:4000\" # 👈 SET TO PROXY URL\n",
"master_key = \"sk-1234\" # 👈 SET TO PROXY MASTER KEY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rauw8EOhgBz5"
},
"outputs": [],
"source": [
"## GLOBAL HTTP CLIENT ## - faster http calls\n",
"class HTTPHandler:\n",
" def __init__(self, concurrent_limit=1000):\n",
" # Create a client with a connection pool\n",
" self.client = httpx.AsyncClient(\n",
" limits=httpx.Limits(\n",
" max_connections=concurrent_limit,\n",
" max_keepalive_connections=concurrent_limit,\n",
" )\n",
" )\n",
"\n",
" async def close(self):\n",
" # Close the client when you're done with it\n",
" await self.client.aclose()\n",
"\n",
" async def get(\n",
" self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None\n",
" ):\n",
" response = await self.client.get(url, params=params, headers=headers)\n",
" return response\n",
"\n",
" async def post(\n",
" self,\n",
" url: str,\n",
" data: Optional[dict] = None,\n",
" params: Optional[dict] = None,\n",
" headers: Optional[dict] = None,\n",
" ):\n",
" try:\n",
" response = await self.client.post(\n",
" url, data=data, params=params, headers=headers\n",
" )\n",
" return response\n",
" except Exception as e:\n",
" raise e\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7LXN8zaLgOie"
},
"source": [
"# Import Sheet\n",
"\n",
"\n",
"Format: | ID | Name | Max Budget |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oiED0usegPGf"
},
"outputs": [],
"source": [
"async def import_sheet():\n",
" tasks = []\n",
" http_client = HTTPHandler()\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for row in csv_reader:\n",
" task = create_user(client=http_client, user_id=row['ID'], max_budget=row['Max Budget'], user_name=row['Name'])\n",
" tasks.append(task)\n",
" # print(f\"ID: {row['ID']}, Name: {row['Name']}, Max Budget: {row['Max Budget']}\")\n",
"\n",
" keys = await asyncio.gather(*tasks)\n",
"\n",
" with open('my-batch-sheet_new.csv', 'w', newline='') as new_file:\n",
" fieldnames = ['ID', 'Name', 'Max Budget', 'keys']\n",
" csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)\n",
" csv_writer.writeheader()\n",
"\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for i, row in enumerate(csv_reader):\n",
" row['keys'] = keys[i] # Add the 'keys' value from the corresponding task result\n",
" csv_writer.writerow(row)\n",
"\n",
" await http_client.close()\n",
"\n",
"asyncio.run(import_sheet())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E7M0Li_UgJeZ"
},
"source": [
"# Create Users + Keys\n",
"\n",
"- Creates a user\n",
"- Creates a key with max budget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NZudRFujf7j-"
},
"outputs": [],
"source": [
"\n",
"async def create_key_with_alias(client: HTTPHandler, user_id: str, max_budget: float):\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"key/generate\"\n",
"\n",
" # call /key/generate\n",
" print(\"CALLING /KEY/GENERATE\")\n",
" response = await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"key_alias\": f\"{user_id}-key\",\n",
" \"max_budget\": max_budget # 👈 KEY CHANGE: SETS MAX BUDGET PER KEY\n",
" })\n",
" )\n",
" print(f\"response: {response.text}\")\n",
" return response.json()[\"key\"]\n",
"\n",
"async def create_user(client: HTTPHandler, user_id: str, max_budget: float, user_name: str):\n",
" \"\"\"\n",
" - call /user/new\n",
" - create key for user\n",
" \"\"\"\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"user/new\"\n",
"\n",
" # call /user/new\n",
" await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"user_alias\": user_name,\n",
" \"auto_create_key\": False,\n",
" # \"max_budget\": max_budget # 👈 [OPTIONAL] Sets max budget per user (if you want to set a max budget across keys)\n",
" })\n",
" )\n",
"\n",
" # create key for user\n",
" return await create_key_with_alias(client=client, user_id=user_id, max_budget=max_budget)\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "680oRk1af-xJ"
},
"source": [
"# Environment Setup"
]
},
"nbformat": 4,
"nbformat_minor": 0
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X7TgJFn8f88p"
},
"outputs": [],
"source": [
"import csv\n",
"from typing import Optional\n",
"import httpx\n",
"import json\n",
"import asyncio\n",
"\n",
"proxy_base_url = \"http://0.0.0.0:4000\" # 👈 SET TO PROXY URL\n",
"master_key = \"sk-1234\" # 👈 SET TO PROXY MASTER KEY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rauw8EOhgBz5"
},
"outputs": [],
"source": [
"## GLOBAL HTTP CLIENT ## - faster http calls\n",
"class HTTPHandler:\n",
" def __init__(self, concurrent_limit=1000):\n",
" # Create a client with a connection pool\n",
" self.client = httpx.AsyncClient(\n",
" limits=httpx.Limits(\n",
" max_connections=concurrent_limit,\n",
" max_keepalive_connections=concurrent_limit,\n",
" )\n",
" )\n",
"\n",
" async def close(self):\n",
" # Close the client when you're done with it\n",
" await self.client.aclose()\n",
"\n",
" async def get(\n",
" self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None\n",
" ):\n",
" response = await self.client.get(url, params=params, headers=headers)\n",
" return response\n",
"\n",
" async def post(\n",
" self,\n",
" url: str,\n",
" data: Optional[dict] = None,\n",
" params: Optional[dict] = None,\n",
" headers: Optional[dict] = None,\n",
" ):\n",
" try:\n",
" response = await self.client.post(\n",
" url, data=data, params=params, headers=headers\n",
" )\n",
" return response\n",
" except Exception as e:\n",
" raise e\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7LXN8zaLgOie"
},
"source": [
"# Import Sheet\n",
"\n",
"\n",
"Format: | ID | Name | Max Budget |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oiED0usegPGf"
},
"outputs": [],
"source": [
"async def import_sheet():\n",
" tasks = []\n",
" http_client = HTTPHandler()\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for row in csv_reader:\n",
" task = create_user(client=http_client, user_id=row['ID'], max_budget=row['Max Budget'], user_name=row['Name'])\n",
" tasks.append(task)\n",
" # print(f\"ID: {row['ID']}, Name: {row['Name']}, Max Budget: {row['Max Budget']}\")\n",
"\n",
" keys = await asyncio.gather(*tasks)\n",
"\n",
" with open('my-batch-sheet_new.csv', 'w', newline='') as new_file:\n",
" fieldnames = ['ID', 'Name', 'Max Budget', 'keys']\n",
" csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)\n",
" csv_writer.writeheader()\n",
"\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for i, row in enumerate(csv_reader):\n",
" row['keys'] = keys[i] # Add the 'keys' value from the corresponding task result\n",
" csv_writer.writerow(row)\n",
"\n",
" await http_client.close()\n",
"\n",
"asyncio.run(import_sheet())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E7M0Li_UgJeZ"
},
"source": [
"# Create Users + Keys\n",
"\n",
"- Creates a user\n",
"- Creates a key with max budget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NZudRFujf7j-"
},
"outputs": [],
"source": [
"\n",
"async def create_key_with_alias(client: HTTPHandler, user_id: str, max_budget: float):\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"key/generate\"\n",
"\n",
" # call /key/generate\n",
" print(\"CALLING /KEY/GENERATE\")\n",
" response = await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"key_alias\": f\"{user_id}-key\",\n",
" \"max_budget\": max_budget # 👈 KEY CHANGE: SETS MAX BUDGET PER KEY\n",
" })\n",
" )\n",
" print(f\"response: {response.text}\")\n",
" return response.json()[\"key\"]\n",
"\n",
"async def create_user(client: HTTPHandler, user_id: str, max_budget: float, user_name: str):\n",
" \"\"\"\n",
" - call /user/new\n",
" - create key for user\n",
" \"\"\"\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"user/new\"\n",
"\n",
" # call /user/new\n",
" await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"user_alias\": user_name,\n",
" \"auto_create_key\": False,\n",
" # \"max_budget\": max_budget # 👈 [OPTIONAL] Sets max budget per user (if you want to set a max budget across keys)\n",
" })\n",
" )\n",
"\n",
" # create key for user\n",
" return await create_key_with_alias(client=client, user_id=user_id, max_budget=max_budget)\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because it is too large


@@ -1,159 +1,157 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "eKXncoQbU_2j"
},
"source": [
"# Using Nemo-Guardrails with LiteLLM Server\n",
"\n",
"[Call Bedrock, TogetherAI, Huggingface, etc. on the server](https://docs.litellm.ai/docs/providers)"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Using Nemo-Guardrails with LiteLLM Server\n",
"\n",
"[Call Bedrock, TogetherAI, Huggingface, etc. on the server](https://docs.litellm.ai/docs/providers)"
],
"metadata": {
"id": "eKXncoQbU_2j"
}
},
{
"cell_type": "markdown",
"source": [
"## Using with Bedrock\n",
"\n",
"`docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID=<your-aws-access-key> -e AWS_SECRET_ACCESS_KEY=<your-aws-secret-key> -p 8000:8000 ghcr.io/berriai/litellm:latest`"
],
"metadata": {
"id": "ZciYaLwvuFbu"
}
},
{
"cell_type": "code",
"source": [
"pip install nemoguardrails langchain"
],
"metadata": {
"id": "vOUwGSJ2Vsy3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xXEJNxe7U0IN"
},
"outputs": [],
"source": [
"import openai\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model_name=\"anthropic.claude-v2\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-fake-key\")\n",
"\n",
"from nemoguardrails import LLMRails, RailsConfig\n",
"\n",
"config = RailsConfig.from_path(\"./config.yml\")\n",
"app = LLMRails(config, llm=llm)\n",
"\n",
"new_message = app.generate(messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello! What can you do for me?\"\n",
"}])"
]
},
{
"cell_type": "markdown",
"source": [
"## Using with TogetherAI\n",
"\n",
"1. You can either set this in the server environment:\n",
"`docker run -e PORT=8000 -e TOGETHERAI_API_KEY=<your-together-ai-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest`\n",
"\n",
"2. **Or** Pass this in as the api key `(...openai_api_key=\"<your-together-ai-api-key>\")`"
],
"metadata": {
"id": "vz5n00qyuKjp"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model_name=\"together_ai/togethercomputer/CodeLlama-13b-Instruct\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-together-ai-api-key\")\n",
"\n",
"from nemoguardrails import LLMRails, RailsConfig\n",
"\n",
"config = RailsConfig.from_path(\"./config.yml\")\n",
"app = LLMRails(config, llm=llm)\n",
"\n",
"new_message = app.generate(messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello! What can you do for me?\"\n",
"}])"
],
"metadata": {
"id": "XK1sk-McuhpE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### CONFIG.YML\n",
"\n",
"save this example `config.yml` in your current directory"
],
"metadata": {
"id": "8A1KWKnzuxAS"
}
},
{
"cell_type": "code",
"source": [
"# instructions:\n",
"# - type: general\n",
"# content: |\n",
"# Below is a conversation between a bot and a user about the recent job reports.\n",
"# The bot is factual and concise. If the bot does not know the answer to a\n",
"# question, it truthfully says it does not know.\n",
"\n",
"# sample_conversation: |\n",
"# user \"Hello there!\"\n",
"# express greeting\n",
"# bot express greeting\n",
"# \"Hello! How can I assist you today?\"\n",
"# user \"What can you do for me?\"\n",
"# ask about capabilities\n",
"# bot respond about capabilities\n",
"# \"I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.\"\n",
"# user \"What's 2+2?\"\n",
"# ask math question\n",
"# bot responds to math question\n",
"# \"2+2 is equal to 4.\"\n",
"\n",
"# models:\n",
"# - type: main\n",
"# engine: openai\n",
"# model: claude-instant-1"
],
"metadata": {
"id": "NKN1GmSvu0Cx"
},
"execution_count": null,
"outputs": []
}
]
{
"cell_type": "markdown",
"metadata": {
"id": "ZciYaLwvuFbu"
},
"source": [
"## Using with Bedrock\n",
"\n",
"`docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID=<your-aws-access-key> -e AWS_SECRET_ACCESS_KEY=<your-aws-secret-key> -p 8000:8000 ghcr.io/berriai/litellm:latest`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vOUwGSJ2Vsy3"
},
"outputs": [],
"source": [
"pip install nemoguardrails langchain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xXEJNxe7U0IN"
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model_name=\"anthropic.claude-v2\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-fake-key\")\n",
"\n",
"from nemoguardrails import LLMRails, RailsConfig\n",
"\n",
"config = RailsConfig.from_path(\"./config.yml\")\n",
"app = LLMRails(config, llm=llm)\n",
"\n",
"new_message = app.generate(messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello! What can you do for me?\"\n",
"}])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vz5n00qyuKjp"
},
"source": [
"## Using with TogetherAI\n",
"\n",
"1. You can either set this in the server environment:\n",
"`docker run -e PORT=8000 -e TOGETHERAI_API_KEY=<your-together-ai-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest`\n",
"\n",
"2. **Or** Pass this in as the api key `(...openai_api_key=\"<your-together-ai-api-key>\")`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XK1sk-McuhpE"
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model_name=\"together_ai/togethercomputer/CodeLlama-13b-Instruct\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-together-ai-api-key\")\n",
"\n",
"from nemoguardrails import LLMRails, RailsConfig\n",
"\n",
"config = RailsConfig.from_path(\"./config.yml\")\n",
"app = LLMRails(config, llm=llm)\n",
"\n",
"new_message = app.generate(messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello! What can you do for me?\"\n",
"}])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8A1KWKnzuxAS"
},
"source": [
"### CONFIG.YML\n",
"\n",
"save this example `config.yml` in your current directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NKN1GmSvu0Cx"
},
"outputs": [],
"source": [
"# instructions:\n",
"# - type: general\n",
"# content: |\n",
"# Below is a conversation between a bot and a user about the recent job reports.\n",
"# The bot is factual and concise. If the bot does not know the answer to a\n",
"# question, it truthfully says it does not know.\n",
"\n",
"# sample_conversation: |\n",
"# user \"Hello there!\"\n",
"# express greeting\n",
"# bot express greeting\n",
"# \"Hello! How can I assist you today?\"\n",
"# user \"What can you do for me?\"\n",
"# ask about capabilities\n",
"# bot respond about capabilities\n",
"# \"I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.\"\n",
"# user \"What's 2+2?\"\n",
"# ask math question\n",
"# bot responds to math question\n",
"# \"2+2 is equal to 4.\"\n",
"\n",
"# models:\n",
"# - type: main\n",
"# engine: openai\n",
"# model: claude-instant-1"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
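The guardrails notebook above assumes a LiteLLM proxy is already listening on `http://0.0.0.0:8000` (started with one of the `docker run` commands shown). A quick way to confirm that before layering NeMo Guardrails on top is a plain chat completion against the same endpoint — a minimal sketch using the openai>=1.0 client, with the placeholder key and the TogetherAI model name taken from the notebook:

```python
# Sanity-check the LiteLLM proxy endpoint the notebook points ChatOpenAI at.
# Assumes the proxy from the docker run command above is running on port 8000;
# the API key is a placeholder, exactly as in the notebook cells.
import openai

client = openai.OpenAI(
    base_url="http://0.0.0.0:8000",
    api_key="my-together-ai-api-key",
)

response = client.chat.completions.create(
    model="together_ai/togethercomputer/CodeLlama-13b-Instruct",
    messages=[{"role": "user", "content": "Hello! What can you do for me?"}],
)
print(response.choices[0].message.content)
```

If this returns a completion, the `ChatOpenAI(..., openai_api_base="http://0.0.0.0:8000")` wiring in the cells above should work unchanged.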

View file

@ -1,16 +1,12 @@
import sys, os
import traceback
from dotenv import load_dotenv
load_dotenv()
import litellm
from litellm import embedding, completion, completion_cost
from autoevals.llm import *
###################
import litellm
# litellm completion call
question = "which country has the highest population"

View file

@ -1,11 +1,12 @@
import traceback
from flask import Flask, request, jsonify, abort, Response
from flask import Flask, request, Response
from flask_cors import CORS
import traceback
import litellm
from util import handle_error
from litellm import completion
import os, dotenv, time
import os
import dotenv
import time
import json
dotenv.load_dotenv()
@ -20,9 +21,9 @@ verbose = True
# litellm.caching_with_models = True # CACHING: caching_with_models Keys in the cache are messages + model. - to learn more: https://docs.litellm.ai/docs/caching/
######### PROMPT LOGGING ##########
os.environ[
"PROMPTLAYER_API_KEY"
] = "" # set your promptlayer key here - https://promptlayer.com/
os.environ["PROMPTLAYER_API_KEY"] = (
"" # set your promptlayer key here - https://promptlayer.com/
)
# set callbacks
litellm.success_callback = ["promptlayer"]
@ -57,9 +58,9 @@ def api_completion():
try:
if "prompt" not in data:
raise ValueError("data needs to have prompt")
data[
"model"
] = "togethercomputer/CodeLlama-34b-Instruct" # by default use Together AI's CodeLlama model - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
data["model"] = (
"togethercomputer/CodeLlama-34b-Instruct" # by default use Together AI's CodeLlama model - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
)
# COMPLETION CALL
system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."
messages = [
@ -75,7 +76,7 @@ def api_completion():
"stream" in data and data["stream"] == True
): # use generate_responses to stream responses
return Response(data_generator(response), mimetype="text/event-stream")
except Exception as e:
except Exception:
# call handle_error function
print_verbose(f"Got Error api_completion(): {traceback.format_exc()}")
## LOG FAILURE

View file

@ -1,5 +1,4 @@
import requests
from urllib.parse import urlparse, parse_qs
def get_next_url(response):

View file

@ -1,238 +1,237 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "gZx-wHJapG5w"
},
"source": [
"# Use liteLLM to call Falcon, Wizard, MPT 7B using OpenAI chatGPT Input/output\n",
"\n",
"* Falcon 7B: https://app.baseten.co/explore/falcon_7b\n",
"* Wizard LM: https://app.baseten.co/explore/wizardlm\n",
"* MPT 7B Base: https://app.baseten.co/explore/mpt_7b_instruct\n",
"\n",
"\n",
"## Call all baseten llm models using OpenAI chatGPT Input/Output using liteLLM\n",
"Example call\n",
"```python\n",
"model = \"q841o8w\" # baseten model version ID\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"```"
]
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4JSRa0QVogPo"
},
"outputs": [],
"source": [
"!pip install litellm==0.1.399\n",
"!pip install baseten urllib3"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "VEukLhDzo4vw"
},
"outputs": [],
"source": [
"import os\n",
"from litellm import completion"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4STYM2OHFNlc"
},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"id": "DorpLxw1FHbC"
},
"outputs": [],
"source": [
"os.environ['BASETEN_API_KEY'] = \"\" #@param\n",
"messages = [{ \"content\": \"what does Baseten do? \",\"role\": \"user\"}]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "syF3dTdKFSQQ"
},
"source": [
"## Calling Falcon 7B: https://app.baseten.co/explore/falcon_7b\n",
"### Pass Your Baseten model `Version ID` as `model`"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rPgSoMlsojz0",
"outputId": "81d6dc7b-1681-4ae4-e4c8-5684eb1bd050"
},
"outputs": [
{
"cell_type": "markdown",
"source": [
"# Use liteLLM to call Falcon, Wizard, MPT 7B using OpenAI chatGPT Input/output\n",
"\n",
"* Falcon 7B: https://app.baseten.co/explore/falcon_7b\n",
"* Wizard LM: https://app.baseten.co/explore/wizardlm\n",
"* MPT 7B Base: https://app.baseten.co/explore/mpt_7b_instruct\n",
"\n",
"\n",
"## Call all baseten llm models using OpenAI chatGPT Input/Output using liteLLM\n",
"Example call\n",
"```python\n",
"model = \"q841o8w\" # baseten model version ID\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"```"
],
"metadata": {
"id": "gZx-wHJapG5w"
}
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO\u001b[0m API key set.\n",
"INFO:baseten:API key set.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4JSRa0QVogPo"
},
"outputs": [],
"source": [
"!pip install litellm==0.1.399\n",
"!pip install baseten urllib3"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import litellm\n",
"from litellm import completion"
],
"metadata": {
"id": "VEukLhDzo4vw"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Setup"
],
"metadata": {
"id": "4STYM2OHFNlc"
}
},
{
"cell_type": "code",
"source": [
"os.environ['BASETEN_API_KEY'] = \"\" #@param\n",
"messages = [{ \"content\": \"what does Baseten do? \",\"role\": \"user\"}]"
],
"metadata": {
"id": "DorpLxw1FHbC"
},
"execution_count": 21,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Calling Falcon 7B: https://app.baseten.co/explore/falcon_7b\n",
"### Pass Your Baseten model `Version ID` as `model`"
],
"metadata": {
"id": "syF3dTdKFSQQ"
}
},
{
"cell_type": "code",
"source": [
"model = \"qvv0xeq\"\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"response"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rPgSoMlsojz0",
"outputId": "81d6dc7b-1681-4ae4-e4c8-5684eb1bd050"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[32mINFO\u001b[0m API key set.\n",
"INFO:baseten:API key set.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': \"what does Baseten do? \\nI'm sorry, I cannot provide a specific answer as\"}}],\n",
" 'created': 1692135883.699066,\n",
" 'model': 'qvv0xeq'}"
]
},
"metadata": {},
"execution_count": 18
}
]
},
{
"cell_type": "markdown",
"source": [
"## Calling Wizard LM https://app.baseten.co/explore/wizardlm\n",
"### Pass Your Baseten model `Version ID` as `model`"
],
"metadata": {
"id": "7n21UroEGCGa"
}
},
{
"cell_type": "code",
"source": [
"model = \"q841o8w\"\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"response"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uLVWFH899lAF",
"outputId": "61c2bc74-673b-413e-bb40-179cf408523d"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[32mINFO\u001b[0m API key set.\n",
"INFO:baseten:API key set.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': 'As an AI language model, I do not have personal beliefs or practices, but based on the information available online, Baseten is a popular name for a traditional Ethiopian dish made with injera, a spongy flatbread, and wat, a spicy stew made with meat or vegetables. It is typically served for breakfast or dinner and is a staple in Ethiopian cuisine. The name Baseten is also used to refer to a traditional Ethiopian coffee ceremony, where coffee is brewed and served in a special ceremony with music and food.'}}],\n",
" 'created': 1692135900.2806294,\n",
" 'model': 'q841o8w'}"
]
},
"metadata": {},
"execution_count": 19
}
]
},
{
"cell_type": "markdown",
"source": [
"## Calling mosaicml/mpt-7b https://app.baseten.co/explore/mpt_7b_instruct\n",
"### Pass Your Baseten model `Version ID` as `model`"
],
"metadata": {
"id": "6-TFwmPAGPXq"
}
},
{
"cell_type": "code",
"source": [
"model = \"31dxrj3\"\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"response"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gbeYZOrUE_Bp",
"outputId": "838d86ea-2143-4cb3-bc80-2acc2346c37a"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[32mINFO\u001b[0m API key set.\n",
"INFO:baseten:API key set.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': \"\\n===================\\n\\nIt's a tool to build a local version of a game on your own machine to host\\non your website.\\n\\nIt's used to make game demos and show them on Twitter, Tumblr, and Facebook.\\n\\n\\n\\n## What's built\\n\\n- A directory of all your game directories, named with a version name and build number, with images linked to.\\n- Includes HTML to include in another site.\\n- Includes images for your icons and\"}}],\n",
" 'created': 1692135914.7472186,\n",
" 'model': '31dxrj3'}"
]
},
"metadata": {},
"execution_count": 20
}
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': \"what does Baseten do? \\nI'm sorry, I cannot provide a specific answer as\"}}],\n",
" 'created': 1692135883.699066,\n",
" 'model': 'qvv0xeq'}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
]
],
"source": [
"model = \"qvv0xeq\"\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7n21UroEGCGa"
},
"source": [
"## Calling Wizard LM https://app.baseten.co/explore/wizardlm\n",
"### Pass Your Baseten model `Version ID` as `model`"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uLVWFH899lAF",
"outputId": "61c2bc74-673b-413e-bb40-179cf408523d"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO\u001b[0m API key set.\n",
"INFO:baseten:API key set.\n"
]
},
{
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': 'As an AI language model, I do not have personal beliefs or practices, but based on the information available online, Baseten is a popular name for a traditional Ethiopian dish made with injera, a spongy flatbread, and wat, a spicy stew made with meat or vegetables. It is typically served for breakfast or dinner and is a staple in Ethiopian cuisine. The name Baseten is also used to refer to a traditional Ethiopian coffee ceremony, where coffee is brewed and served in a special ceremony with music and food.'}}],\n",
" 'created': 1692135900.2806294,\n",
" 'model': 'q841o8w'}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = \"q841o8w\"\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6-TFwmPAGPXq"
},
"source": [
"## Calling mosaicml/mpt-7b https://app.baseten.co/explore/mpt_7b_instruct\n",
"### Pass Your Baseten model `Version ID` as `model`"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gbeYZOrUE_Bp",
"outputId": "838d86ea-2143-4cb3-bc80-2acc2346c37a"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO\u001b[0m API key set.\n",
"INFO:baseten:API key set.\n"
]
},
{
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': \"\\n===================\\n\\nIt's a tool to build a local version of a game on your own machine to host\\non your website.\\n\\nIt's used to make game demos and show them on Twitter, Tumblr, and Facebook.\\n\\n\\n\\n## What's built\\n\\n- A directory of all your game directories, named with a version name and build number, with images linked to.\\n- Includes HTML to include in another site.\\n- Includes images for your icons and\"}}],\n",
" 'created': 1692135914.7472186,\n",
" 'model': '31dxrj3'}"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = \"31dxrj3\"\n",
"response = completion(model=model, messages=messages, custom_llm_provider=\"baseten\")\n",
"response"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
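The three cells above repeat the same `completion(..., custom_llm_provider="baseten")` call for each Baseten deployment. A small loop makes that comparison easier to rerun — a sketch that reuses only the call shown in the notebook and assumes `BASETEN_API_KEY` is already set:

```python
# Compare replies from the Baseten deployments used in this notebook.
from litellm import completion

messages = [{"content": "what does Baseten do? ", "role": "user"}]
# Version IDs from the cells above: Falcon 7B, WizardLM, MPT-7B Instruct.
version_ids = ["qvv0xeq", "q841o8w", "31dxrj3"]

for version_id in version_ids:
    try:
        response = completion(model=version_id, messages=messages, custom_llm_provider="baseten")
        print(version_id, "->", response["choices"][0]["message"]["content"][:100])
    except Exception as exc:  # keep going if one deployment is unavailable
        print(version_id, "-> error:", exc)
```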

View file

@ -1,201 +1,195 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "5hwntUxTMxEk"
},
"source": [
"# Langchain liteLLM Demo Notebook\n",
"## Use `ChatLiteLLM()` to instantly support 50+ LLM models\n",
"Langchain Docs: https://python.langchain.com/docs/integrations/chat/litellm\n",
"\n",
"Call all LLM models using the same I/O interface\n",
"\n",
"Example usage\n",
"```python\n",
"ChatLiteLLM(model=\"gpt-3.5-turbo\")\n",
"ChatLiteLLM(model=\"claude-2\", temperature=0.3)\n",
"ChatLiteLLM(model=\"command-nightly\")\n",
"ChatLiteLLM(model=\"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\")\n",
"```"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Langchain liteLLM Demo Notebook\n",
"## Use `ChatLiteLLM()` to instantly support 50+ LLM models\n",
"Langchain Docs: https://python.langchain.com/docs/integrations/chat/litellm\n",
"\n",
"Call all LLM models using the same I/O interface\n",
"\n",
"Example usage\n",
"```python\n",
"ChatLiteLLM(model=\"gpt-3.5-turbo\")\n",
"ChatLiteLLM(model=\"claude-2\", temperature=0.3)\n",
"ChatLiteLLM(model=\"command-nightly\")\n",
"ChatLiteLLM(model=\"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\")\n",
"```"
],
"metadata": {
"id": "5hwntUxTMxEk"
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aPNAUsCvB6Sv"
},
"outputs": [],
"source": [
"!pip install litellm langchain"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "MOhRaVnhB-0J"
},
"outputs": [],
"source": [
"import os\n",
"from langchain.chat_models import ChatLiteLLM\n",
"from langchain.schema import HumanMessage"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "TahkCtlmCD65",
"outputId": "5ddda40f-f252-4830-a8d6-bd3fa68ae487"
},
"outputs": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aPNAUsCvB6Sv"
},
"outputs": [],
"source": [
"!pip install litellm langchain"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"from langchain.chat_models import ChatLiteLLM\n",
"from langchain.prompts.chat import (\n",
" ChatPromptTemplate,\n",
" SystemMessagePromptTemplate,\n",
" AIMessagePromptTemplate,\n",
" HumanMessagePromptTemplate,\n",
")\n",
"from langchain.schema import AIMessage, HumanMessage, SystemMessage"
],
"metadata": {
"id": "MOhRaVnhB-0J"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"os.environ['OPENAI_API_KEY'] = \"\"\n",
"chat = ChatLiteLLM(model=\"gpt-3.5-turbo\")\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you\"\n",
" )\n",
"]\n",
"chat(messages)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "TahkCtlmCD65",
"outputId": "5ddda40f-f252-4830-a8d6-bd3fa68ae487"
},
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"AIMessage(content='I am an AI model known as GPT-3, developed by OpenAI.', additional_kwargs={}, example=False)"
]
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"source": [
"os.environ['ANTHROPIC_API_KEY'] = \"\"\n",
"chat = ChatLiteLLM(model=\"claude-2\", temperature=0.3)\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you\"\n",
" )\n",
"]\n",
"chat(messages)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uXNDyU4jChcs",
"outputId": "bd74b4c6-f9fb-42dc-fdc3-9240d50503ba"
},
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"AIMessage(content=\" I'm Claude, an AI assistant created by Anthropic.\", additional_kwargs={}, example=False)"
]
},
"metadata": {},
"execution_count": 23
}
]
},
{
"cell_type": "code",
"source": [
"os.environ['REPLICATE_API_TOKEN'] = \"\"\n",
"chat = ChatLiteLLM(model=\"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\")\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you?\"\n",
" )\n",
"]\n",
"chat(messages)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "czbDJRKcC7BV",
"outputId": "892e147d-831e-4884-dc71-040f92c3fb8e"
},
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"AIMessage(content=\" I'm an AI based based on LLaMA models (LLaMA: Open and Efficient Foundation Language Models, Touvron et al. 2023), my knowledge was built from a massive corpus of text, including books, articles, and websites, and I was trained using a variety of machine learning algorithms. My model architecture is based on the transformer architecture, which is particularly well-suited for natural language processing tasks. My team of developers and I are constantly working to improve and fine-tune my performance, and I am always happy to help with any questions you may have!\", additional_kwargs={}, example=False)"
]
},
"metadata": {},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"source": [
"os.environ['COHERE_API_KEY'] = \"\"\n",
"chat = ChatLiteLLM(model=\"command-nightly\")\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you?\"\n",
" )\n",
"]\n",
"chat(messages)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tZxpq5PDDY9Y",
"outputId": "7e86f4ed-ac7a-45e1-87d0-217da6cad666"
},
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"AIMessage(content=' I am an AI-based large language model, or Chatbot, built by the company Cohere. I am designed to have polite, helpful, inclusive conversations with users. I am always learning and improving, and I am constantly being updated with new information and improvements.\\n\\nI am currently in the development phase, and I am not yet available to the general public. However, I am currently being used by a select group of users for testing and feedback.\\n\\nI am a large language model, which means that I am trained on a massive amount of data and can understand and respond to a wide range of requests and questions. I am also designed to be flexible and adaptable, so I can be customized to suit the needs of different users and use cases.\\n\\nI am currently being used to develop a range of applications, including customer service chatbots, content generation tools, and language translation services. I am also being used to train other language models and to develop new ways of using large language models.\\n\\nI am constantly being updated with new information and improvements, so I am always learning and improving. I am also being used to develop new ways of using large language models, so I am always evolving and adapting to new use cases and requirements.', additional_kwargs={}, example=False)"
]
},
"metadata": {},
"execution_count": 30
}
"data": {
"text/plain": [
"AIMessage(content='I am an AI model known as GPT-3, developed by OpenAI.', additional_kwargs={}, example=False)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
]
],
"source": [
"os.environ['OPENAI_API_KEY'] = \"\"\n",
"chat = ChatLiteLLM(model=\"gpt-3.5-turbo\")\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you\"\n",
" )\n",
"]\n",
"chat(messages)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uXNDyU4jChcs",
"outputId": "bd74b4c6-f9fb-42dc-fdc3-9240d50503ba"
},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content=\" I'm Claude, an AI assistant created by Anthropic.\", additional_kwargs={}, example=False)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ['ANTHROPIC_API_KEY'] = \"\"\n",
"chat = ChatLiteLLM(model=\"claude-2\", temperature=0.3)\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you\"\n",
" )\n",
"]\n",
"chat(messages)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "czbDJRKcC7BV",
"outputId": "892e147d-831e-4884-dc71-040f92c3fb8e"
},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content=\" I'm an AI based based on LLaMA models (LLaMA: Open and Efficient Foundation Language Models, Touvron et al. 2023), my knowledge was built from a massive corpus of text, including books, articles, and websites, and I was trained using a variety of machine learning algorithms. My model architecture is based on the transformer architecture, which is particularly well-suited for natural language processing tasks. My team of developers and I are constantly working to improve and fine-tune my performance, and I am always happy to help with any questions you may have!\", additional_kwargs={}, example=False)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ['REPLICATE_API_TOKEN'] = \"\"\n",
"chat = ChatLiteLLM(model=\"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\")\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you?\"\n",
" )\n",
"]\n",
"chat(messages)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tZxpq5PDDY9Y",
"outputId": "7e86f4ed-ac7a-45e1-87d0-217da6cad666"
},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content=' I am an AI-based large language model, or Chatbot, built by the company Cohere. I am designed to have polite, helpful, inclusive conversations with users. I am always learning and improving, and I am constantly being updated with new information and improvements.\\n\\nI am currently in the development phase, and I am not yet available to the general public. However, I am currently being used by a select group of users for testing and feedback.\\n\\nI am a large language model, which means that I am trained on a massive amount of data and can understand and respond to a wide range of requests and questions. I am also designed to be flexible and adaptable, so I can be customized to suit the needs of different users and use cases.\\n\\nI am currently being used to develop a range of applications, including customer service chatbots, content generation tools, and language translation services. I am also being used to train other language models and to develop new ways of using large language models.\\n\\nI am constantly being updated with new information and improvements, so I am always learning and improving. I am also being used to develop new ways of using large language models, so I am always evolving and adapting to new use cases and requirements.', additional_kwargs={}, example=False)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ['COHERE_API_KEY'] = \"\"\n",
"chat = ChatLiteLLM(model=\"command-nightly\")\n",
"messages = [\n",
" HumanMessage(\n",
" content=\"what model are you?\"\n",
" )\n",
"]\n",
"chat(messages)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
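Since every cell above follows the same pattern — construct `ChatLiteLLM` with a different model name, then call it with a `HumanMessage` — the comparison can be expressed as one loop. A sketch, assuming the OpenAI, Anthropic, and Cohere keys are already set in the environment as in the cells above:

```python
# Ask several ChatLiteLLM-backed models the same question and print each reply.
from langchain.chat_models import ChatLiteLLM
from langchain.schema import HumanMessage

models = ["gpt-3.5-turbo", "claude-2", "command-nightly"]
messages = [HumanMessage(content="what model are you?")]

for model_name in models:
    chat = ChatLiteLLM(model=model_name)
    reply = chat(messages)  # returns an AIMessage, as the outputs above show
    print(f"{model_name}: {reply.content}")
```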

View file

@ -43,7 +43,7 @@
"source": [
"# set you Vertex AI configs\n",
"import litellm\n",
"from litellm import embedding, completion\n",
"from litellm import completion\n",
"\n",
"litellm.vertex_project = \"hardy-device-386718\"\n",
"litellm.vertex_location = \"us-central1\""

View file

@ -1,331 +1,331 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "vnvlwUDZK7VA"
},
"source": [
"## Demo Notebook of Function Calling with liteLLM\n",
"- Supported Providers for Function Calling\n",
" - OpenAI - `gpt-4-0613` and `gpt-3.5-turbo-0613`\n",
"- In this notebook we use function calling with `litellm.completion()`"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Demo Notebook of Function Calling with liteLLM\n",
"- Supported Providers for Function Calling\n",
" - OpenAI - `gpt-4-0613` and `gpt-3.5-turbo-0613`\n",
"- In this notebook we use function calling with `litellm.completion()`"
],
"metadata": {
"id": "vnvlwUDZK7VA"
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KrINCwRfLgZV"
},
"outputs": [],
"source": [
"## Install liteLLM\n",
"!pip install litellm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "nK7zR5OgLlh2"
},
"outputs": [],
"source": [
"import os\n",
"from litellm import completion"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"id": "dCQlyBxKLqbA"
},
"outputs": [],
"source": [
"os.environ['OPENAI_API_KEY'] = \"\" #@param"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gfdGv-FMRCdX"
},
"source": [
"## Define Messages, Functions\n",
"We create a get_current_weather() function and pass that to GPT 3.5\n",
"\n",
"See OpenAI docs for this: https://openai.com/blog/function-calling-and-other-api-updates"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"id": "ERzsP1sfM19C"
},
"outputs": [],
"source": [
"messages = [\n",
" {\"role\": \"user\", \"content\": \"What is the weather like in Boston?\"}\n",
"]\n",
"\n",
"def get_current_weather(location):\n",
" if location == \"Boston, MA\":\n",
" return \"The weather is 12F\"\n",
"\n",
"functions = [\n",
" {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\"\n",
" },\n",
" \"unit\": {\n",
" \"type\": \"string\",\n",
" \"enum\": [\"celsius\", \"fahrenheit\"]\n",
" }\n",
" },\n",
" \"required\": [\"location\"]\n",
" }\n",
" }\n",
" ]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NX6by2VuRPnp"
},
"source": [
"## Call gpt-3.5-turbo-0613 to Decide what Function to call"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QVoJ5PtxMlVx",
"outputId": "efe7a81f-e04a-4afc-aa60-a2b2648f5fb9"
},
"outputs": [
{
"cell_type": "code",
"source": [
"## Install liteLLM\n",
"!pip install litellm"
],
"metadata": {
"id": "KrINCwRfLgZV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import os, litellm\n",
"from litellm import completion"
],
"metadata": {
"id": "nK7zR5OgLlh2"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"os.environ['OPENAI_API_KEY'] = \"\" #@param"
],
"metadata": {
"id": "dCQlyBxKLqbA"
},
"execution_count": 27,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Define Messages, Functions\n",
"We create a get_current_weather() function and pass that to GPT 3.5\n",
"\n",
"See OpenAI docs for this: https://openai.com/blog/function-calling-and-other-api-updates"
],
"metadata": {
"id": "gfdGv-FMRCdX"
}
},
{
"cell_type": "code",
"source": [
"messages = [\n",
" {\"role\": \"user\", \"content\": \"What is the weather like in Boston?\"}\n",
"]\n",
"\n",
"def get_current_weather(location):\n",
" if location == \"Boston, MA\":\n",
" return \"The weather is 12F\"\n",
"\n",
"functions = [\n",
" {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\"\n",
" },\n",
" \"unit\": {\n",
" \"type\": \"string\",\n",
" \"enum\": [\"celsius\", \"fahrenheit\"]\n",
" }\n",
" },\n",
" \"required\": [\"location\"]\n",
" }\n",
" }\n",
" ]"
],
"metadata": {
"id": "ERzsP1sfM19C"
},
"execution_count": 25,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Call gpt-3.5-turbo-0613 to Decide what Function to call"
],
"metadata": {
"id": "NX6by2VuRPnp"
}
},
{
"cell_type": "code",
"source": [
"response = completion(model=\"gpt-3.5-turbo-0613\", messages=messages, functions=functions)\n",
"print(response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QVoJ5PtxMlVx",
"outputId": "efe7a81f-e04a-4afc-aa60-a2b2648f5fb9"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\n",
" \"id\": \"chatcmpl-7mX4RiqdoislVEqfmfVjFSKp3hyIy\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1691801223,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": null,\n",
" \"function_call\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"arguments\": \"{\\n \\\"location\\\": \\\"Boston, MA\\\"\\n}\"\n",
" }\n",
" },\n",
" \"finish_reason\": \"function_call\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 82,\n",
" \"completion_tokens\": 18,\n",
" \"total_tokens\": 100\n",
" }\n",
"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Parse GPT 3.5 Response\n",
"Read Information about what Function to Call"
],
"metadata": {
"id": "Yu0o2saDNLx8"
}
},
{
"cell_type": "code",
"source": [
"function_call_data = response[\"choices\"][0][\"message\"][\"function_call\"]\n",
"function_call_data"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u1DzXLJsNOR5",
"outputId": "177e9501-0ce2-4619-9067-3047f18f6c79"
},
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<OpenAIObject at 0x7922c70ce930> JSON: {\n",
" \"name\": \"get_current_weather\",\n",
" \"arguments\": \"{\\n \\\"location\\\": \\\"Boston, MA\\\"\\n}\"\n",
"}"
]
},
"metadata": {},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"source": [
"import json\n",
"function_name = function_call_data['name']\n",
"function_args = function_call_data['arguments']\n",
"function_args = json.loads(function_args)\n",
"print(function_name, function_args)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tYb96Mh0NhH9",
"outputId": "13c4bb89-6f29-4b3b-afa7-302dcf2cdd5f"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"get_current_weather {'location': 'Boston, MA'}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Call the get_current_weather() function"
],
"metadata": {
"id": "z3tstH_yN3fX"
}
},
{
"cell_type": "code",
"source": [
"if function_name == \"get_current_weather\":\n",
" result = get_current_weather(**function_args)\n",
" print(result)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "TSb8JHhgN5Zc",
"outputId": "ef140572-4020-4daf-ac8c-d5161be9aa5c"
},
"execution_count": 24,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"12F\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Send the response from get_current_weather back to the model to summarize"
],
"metadata": {
"id": "k4HGJE3NRmMI"
}
},
{
"cell_type": "code",
"source": [
"messages = [\n",
" {\"role\": \"user\", \"content\": \"What is the weather like in Boston?\"},\n",
" {\"role\": \"assistant\", \"content\": None, \"function_call\": {\"name\": \"get_current_weather\", \"arguments\": \"{ \\\"location\\\": \\\"Boston, MA\\\"}\"}},\n",
" {\"role\": \"function\", \"name\": \"get_current_weather\", \"content\": result}\n",
"]\n",
"response = completion(model=\"gpt-3.5-turbo-0613\", messages=messages, functions=functions)\n",
"print(response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a23cmEwiPaw7",
"outputId": "43259b86-0c4c-4fcb-eab7-6e1a788b2f21"
},
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\n",
" \"id\": \"chatcmpl-7mXGN62u75WXp1Lgen4iSgNvA7hHT\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1691801963,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"The current weather in Boston is 12 degrees Fahrenheit.\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 109,\n",
" \"completion_tokens\": 12,\n",
" \"total_tokens\": 121\n",
" }\n",
"}\n"
]
}
]
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"id\": \"chatcmpl-7mX4RiqdoislVEqfmfVjFSKp3hyIy\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1691801223,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": null,\n",
" \"function_call\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"arguments\": \"{\\n \\\"location\\\": \\\"Boston, MA\\\"\\n}\"\n",
" }\n",
" },\n",
" \"finish_reason\": \"function_call\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 82,\n",
" \"completion_tokens\": 18,\n",
" \"total_tokens\": 100\n",
" }\n",
"}\n"
]
}
]
],
"source": [
"response = completion(model=\"gpt-3.5-turbo-0613\", messages=messages, functions=functions)\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Yu0o2saDNLx8"
},
"source": [
"## Parse GPT 3.5 Response\n",
"Read Information about what Function to Call"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u1DzXLJsNOR5",
"outputId": "177e9501-0ce2-4619-9067-3047f18f6c79"
},
"outputs": [
{
"data": {
"text/plain": [
"<OpenAIObject at 0x7922c70ce930> JSON: {\n",
" \"name\": \"get_current_weather\",\n",
" \"arguments\": \"{\\n \\\"location\\\": \\\"Boston, MA\\\"\\n}\"\n",
"}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function_call_data = response[\"choices\"][0][\"message\"][\"function_call\"]\n",
"function_call_data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tYb96Mh0NhH9",
"outputId": "13c4bb89-6f29-4b3b-afa7-302dcf2cdd5f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"get_current_weather {'location': 'Boston, MA'}\n"
]
}
],
"source": [
"import json\n",
"function_name = function_call_data['name']\n",
"function_args = function_call_data['arguments']\n",
"function_args = json.loads(function_args)\n",
"print(function_name, function_args)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "z3tstH_yN3fX"
},
"source": [
"## Call the get_current_weather() function"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "TSb8JHhgN5Zc",
"outputId": "ef140572-4020-4daf-ac8c-d5161be9aa5c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12F\n"
]
}
],
"source": [
"if function_name == \"get_current_weather\":\n",
" result = get_current_weather(**function_args)\n",
" print(result)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "k4HGJE3NRmMI"
},
"source": [
"## Send the response from get_current_weather back to the model to summarize"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a23cmEwiPaw7",
"outputId": "43259b86-0c4c-4fcb-eab7-6e1a788b2f21"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"id\": \"chatcmpl-7mXGN62u75WXp1Lgen4iSgNvA7hHT\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1691801963,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"The current weather in Boston is 12 degrees Fahrenheit.\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 109,\n",
" \"completion_tokens\": 12,\n",
" \"total_tokens\": 121\n",
" }\n",
"}\n"
]
}
],
"source": [
"messages = [\n",
" {\"role\": \"user\", \"content\": \"What is the weather like in Boston?\"},\n",
" {\"role\": \"assistant\", \"content\": None, \"function_call\": {\"name\": \"get_current_weather\", \"arguments\": \"{ \\\"location\\\": \\\"Boston, MA\\\"}\"}},\n",
" {\"role\": \"function\", \"name\": \"get_current_weather\", \"content\": result}\n",
"]\n",
"response = completion(model=\"gpt-3.5-turbo-0613\", messages=messages, functions=functions)\n",
"print(response)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
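The notebook walks through the function-calling round trip step by step: ask the model which function to call, run it locally, then send the result back for a natural-language answer. The same flow can be folded into one helper — a sketch that only reuses the `completion()` call and the message shapes shown above:

```python
import json
from litellm import completion

def answer_with_function(user_question, functions, available_functions,
                         model="gpt-3.5-turbo-0613"):
    # Step 1: let the model decide which function to call.
    messages = [{"role": "user", "content": user_question}]
    response = completion(model=model, messages=messages, functions=functions)
    function_call_data = response["choices"][0]["message"]["function_call"]

    # Step 2: run the chosen local function with the model-supplied arguments.
    function_name = function_call_data["name"]
    function_args = json.loads(function_call_data["arguments"])
    result = available_functions[function_name](**function_args)

    # Step 3: send the result back so the model can phrase the final answer.
    messages.append({"role": "assistant", "content": None,
                     "function_call": {"name": function_name,
                                       "arguments": function_call_data["arguments"]}})
    messages.append({"role": "function", "name": function_name, "content": result})
    final = completion(model=model, messages=messages, functions=functions)
    return final["choices"][0]["message"]["content"]

# Example usage with the notebook's definitions:
# answer_with_function("What is the weather like in Boston?", functions,
#                      {"get_current_weather": get_current_weather})
```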

View file

@ -1 +1 @@
litellm
litellm==1.55.3

View file

@ -1,13 +1,13 @@
import openai
api_base = f"http://0.0.0.0:8000"
api_base = "http://0.0.0.0:8000"
openai.api_base = api_base
openai.api_key = "temp-key"
print(openai.api_base)
print(f"LiteLLM: response from proxy with streaming")
print("LiteLLM: response from proxy with streaming")
response = openai.ChatCompletion.create(
model="ollama/llama2",
messages=[

File diff suppressed because one or more lines are too long

View file

@ -1,52 +1,51 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "j6yJsCGeaq8G"
},
"outputs": [],
"source": [
"!pip install litellm"
]
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install litellm"
],
"metadata": {
"id": "j6yJsCGeaq8G"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u129iWNPaf72"
},
"outputs": [],
"source": [
"import litellm\n",
"from litellm import embedding, completion\n",
"\n",
"model_fallback_list = [\"claude-instant-1\", \"gpt-3.5-turbo\", \"chatgpt-test\"]\n",
"\n",
"user_message = \"Hello, how are you?\"\n",
"messages = [{ \"content\": user_message,\"role\": \"user\"}]\n",
"\n",
"for model in model_fallback_list:\n",
" try:\n",
" response = completion(model=model, messages=messages)\n",
" except Exception as e:\n",
" print(f\"error occurred: {traceback.format_exc()}\")"
]
}
]
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u129iWNPaf72"
},
"outputs": [],
"source": [
"from litellm import completion\n",
"\n",
"model_fallback_list = [\"claude-instant-1\", \"gpt-3.5-turbo\", \"chatgpt-test\"]\n",
"\n",
"user_message = \"Hello, how are you?\"\n",
"messages = [{ \"content\": user_message,\"role\": \"user\"}]\n",
"\n",
"for model in model_fallback_list:\n",
" try:\n",
" response = completion(model=model, messages=messages)\n",
" except Exception:\n",
" print(f\"error occurred: {traceback.format_exc()}\")"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
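The loop above calls every model in `model_fallback_list` regardless of outcome. If the goal is fallback rather than a sweep, stopping at the first model that answers is usually what you want — a sketch using only the same `completion()` call and assuming the same provider keys:

```python
import traceback
from litellm import completion

model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"]
messages = [{"content": "Hello, how are you?", "role": "user"}]

response = None
errors = {}
for model in model_fallback_list:
    try:
        response = completion(model=model, messages=messages)
        break  # first model that succeeds wins
    except Exception:
        errors[model] = traceback.format_exc()

if response is not None:
    print(response["choices"][0]["message"]["content"])
else:
    print("all models failed:", list(errors))
```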

View file

@ -1,14 +1,12 @@
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
load_dotenv()
import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
from litellm import Router
import litellm
@ -137,7 +135,7 @@ for future in futures:
else:
failed_calls += 1
print(f"Load test Summary:")
print("Load test Summary:")
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {successful_calls}")
print(f"Failed Calls: {failed_calls}")

View file

@ -1,14 +1,12 @@
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
load_dotenv()
import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
from litellm import Router
import litellm
@ -160,7 +158,7 @@ for future in futures:
else:
failed_calls += 1
print(f"Load test Summary:")
print("Load test Summary:")
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {successful_calls}")
print(f"Failed Calls: {failed_calls}")

View file

@ -1,14 +1,12 @@
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
load_dotenv()
import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
from litellm import Router
import litellm
@ -132,7 +130,7 @@ for future in futures:
else:
failed_calls += 1
print(f"Load test Summary:")
print("Load test Summary:")
print(f"Total Requests: {concurrent_calls}")
print(f"Successful Calls: {successful_calls}")
print(f"Failed Calls: {failed_calls}")

View file

@ -1,14 +1,9 @@
from fastapi import FastAPI
import uvicorn
from memory_profiler import profile, memory_usage
from memory_profiler import profile
import os
import traceback
import asyncio
import pytest
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import uuid

View file

@ -1,17 +1,16 @@
#### What this tests ####
from memory_profiler import profile, memory_usage
import sys, os, time
import traceback, asyncio
import pytest
from memory_profiler import profile
import sys
import os
import time
import asyncio
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import uuid

View file

@ -1,17 +1,16 @@
#### What this tests ####
from memory_profiler import profile, memory_usage
import sys, os, time
import traceback, asyncio
import pytest
from memory_profiler import profile
import sys
import os
import time
import asyncio
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import Router
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import uuid

View file

@ -1,17 +1,14 @@
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
from litellm import Timeout
import time
from litellm.caching.caching import Cache
import litellm
import openai
### Test just calling AsyncAzureOpenAI

View file

@ -1,7 +1,6 @@
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(

View file

@ -1,7 +1,6 @@
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(

View file

@ -0,0 +1,172 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "4FbDOmcj2VkM"
},
"source": [
"## Use LiteLLM with Arize\n",
"https://docs.litellm.ai/docs/observability/arize_integration\n",
"\n",
"This method uses the litellm proxy to send the data to Arize. The callback is set in the litellm config below, instead of using OpenInference tracing."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "21W8Woog26Ns"
},
"source": [
"## Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "xrjKLBxhxu2L"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: litellm in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (1.54.1)\n",
"Requirement already satisfied: aiohttp in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.11.10)\n",
"Requirement already satisfied: click in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.1.7)\n",
"Requirement already satisfied: httpx<0.28.0,>=0.23.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.27.2)\n",
"Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.5.0)\n",
"Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.1.4)\n",
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (4.23.0)\n",
"Requirement already satisfied: openai>=1.55.3 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.57.1)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.10.3)\n",
"Requirement already satisfied: python-dotenv>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.0.1)\n",
"Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.32.3)\n",
"Requirement already satisfied: tiktoken>=0.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.7.0)\n",
"Requirement already satisfied: tokenizers in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.21.0)\n",
"Requirement already satisfied: anyio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (4.7.0)\n",
"Requirement already satisfied: certifi in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (2024.8.30)\n",
"Requirement already satisfied: httpcore==1.* in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.0.7)\n",
"Requirement already satisfied: idna in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (3.10)\n",
"Requirement already satisfied: sniffio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.3.1)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.23.0->litellm) (0.14.0)\n",
"Requirement already satisfied: zipp>=3.20 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm) (3.21.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm) (3.0.2)\n",
"Requirement already satisfied: attrs>=22.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (24.2.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.22.3)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (1.9.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (0.6.1)\n",
"Requirement already satisfied: tqdm>4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.67.1)\n",
"Requirement already satisfied: typing-extensions<5,>=4.11 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.12.2)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (2.27.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (3.4.0)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (2.0.7)\n",
"Requirement already satisfied: regex>=2022.1.18 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm) (2024.11.6)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (2.4.4)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.3.1)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.18.3)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tokenizers->litellm) (0.26.5)\n",
"Requirement already satisfied: filelock in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (3.16.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (2024.10.0)\n",
"Requirement already satisfied: packaging>=20.9 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (6.0.2)\n"
]
}
],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jHEu-TjZ29PJ"
},
"source": [
"## Set Env Variables"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "QWd9rTysxsWO"
},
"outputs": [],
"source": [
"import litellm\n",
"import os\n",
"from getpass import getpass\n",
"\n",
"os.environ[\"ARIZE_SPACE_KEY\"] = getpass(\"Enter your Arize space key: \")\n",
"os.environ[\"ARIZE_API_KEY\"] = getpass(\"Enter your Arize API key: \")\n",
"os.environ['OPENAI_API_KEY']= getpass(\"Enter your OpenAI API key: \")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's run a completion call and see the traces in Arize"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello! Nice to meet you, OpenAI. How can I assist you today?\n"
]
}
],
"source": [
"# set arize as a callback, litellm will send the data to arize\n",
"litellm.callbacks = [\"arize\"]\n",
" \n",
"# openai call\n",
"response = litellm.completion(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"Hi 👋 - i'm openai\"}\n",
" ]\n",
")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
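The same Arize-instrumented call also works asynchronously. A sketch with `litellm.acompletion`, assuming the `ARIZE_SPACE_KEY`, `ARIZE_API_KEY`, and `OPENAI_API_KEY` variables set above:

```python
import asyncio
import litellm

litellm.callbacks = ["arize"]  # same callback as in the notebook

async def main():
    response = await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```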

View file

@ -0,0 +1,252 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LLM Ops Stack - LiteLLM Proxy + Langfuse \n",
"\n",
"This notebook demonstrates how to use LiteLLM Proxy with Langfuse \n",
"- Use LiteLLM Proxy for calling 100+ LLMs in OpenAI format\n",
"- Use Langfuse for viewing request / response traces \n",
"\n",
"\n",
"In this notebook we will setup LiteLLM Proxy to make requests to OpenAI, Anthropic, Bedrock and automatically log traces to Langfuse."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Setup LiteLLM Proxy\n",
"\n",
"### 1.1 Define .env variables \n",
"Define .env variables on the container that litellm proxy is running on.\n",
"```bash\n",
"## LLM API Keys\n",
"OPENAI_API_KEY=sk-proj-1234567890\n",
"ANTHROPIC_API_KEY=sk-ant-api03-1234567890\n",
"AWS_ACCESS_KEY_ID=1234567890\n",
"AWS_SECRET_ACCESS_KEY=1234567890\n",
"\n",
"## Langfuse Logging \n",
"LANGFUSE_PUBLIC_KEY=\"pk-lf-xxxx9\"\n",
"LANGFUSE_SECRET_KEY=\"sk-lf-xxxx9\"\n",
"LANGFUSE_HOST=\"https://us.cloud.langfuse.com\"\n",
"```\n",
"\n",
"\n",
"### 1.1 Setup LiteLLM Proxy Config yaml \n",
"```yaml\n",
"model_list:\n",
" - model_name: gpt-4o\n",
" litellm_params:\n",
" model: openai/gpt-4o\n",
" api_key: os.environ/OPENAI_API_KEY\n",
" - model_name: claude-3-5-sonnet-20241022\n",
" litellm_params:\n",
" model: anthropic/claude-3-5-sonnet-20241022\n",
" api_key: os.environ/ANTHROPIC_API_KEY\n",
" - model_name: us.amazon.nova-micro-v1:0\n",
" litellm_params:\n",
" model: bedrock/us.amazon.nova-micro-v1:0\n",
" aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID\n",
" aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY\n",
"\n",
"litellm_settings:\n",
" callbacks: [\"langfuse\"]\n",
"\n",
"\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Make LLM Requests to LiteLLM Proxy\n",
"\n",
"Now we will make our first LLM request to LiteLLM Proxy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1 Setup Client Side Variables to point to LiteLLM Proxy\n",
"Set `LITELLM_PROXY_BASE_URL` to the base url of the LiteLLM Proxy and `LITELLM_VIRTUAL_KEY` to the virtual key you want to use for Authentication to LiteLLM Proxy. (Note: In this initial setup you can)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"\n",
"LITELLM_PROXY_BASE_URL=\"http://0.0.0.0:4000\"\n",
"LITELLM_VIRTUAL_KEY=\"sk-oXXRa1xxxxxxxxxxx\""
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ChatCompletion(id='chatcmpl-B0sq6QkOKNMJ0dwP3x7OoMqk1jZcI', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Langfuse is a platform designed to monitor, observe, and troubleshoot AI and large language model (LLM) applications. It provides features that help developers gain insights into how their AI systems are performing, make debugging easier, and optimize the deployment of models. Langfuse allows for tracking of model interactions, collecting telemetry, and visualizing data, which is crucial for understanding the behavior of AI models in production environments. This kind of tool is particularly useful for developers working with language models who need to ensure reliability and efficiency in their applications.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739550502, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_523b9b6e5f', usage=CompletionUsage(completion_tokens=109, prompt_tokens=13, total_tokens=122, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=LITELLM_VIRTUAL_KEY,\n",
" base_url=LITELLM_PROXY_BASE_URL\n",
")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what is Langfuse?\"\n",
" }\n",
" ],\n",
")\n",
"\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 View Traces on Langfuse\n",
"LiteLLM will send the request / response, model, tokens (input + output), cost to Langfuse.\n",
"\n",
"![image_description](litellm_proxy_langfuse.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.4 Call Anthropic, Bedrock models \n",
"\n",
"Now we can call `us.amazon.nova-micro-v1:0` and `claude-3-5-sonnet-20241022` models defined on your config.yaml both in the OpenAI request / response format."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ChatCompletion(id='chatcmpl-7756e509-e61f-4f5e-b5ae-b7a41013522a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Langfuse is an observability tool designed specifically for machine learning models and applications built with natural language processing (NLP) and large language models (LLMs). It focuses on providing detailed insights into how these models perform in real-world scenarios. Here are some key features and purposes of Langfuse:\\n\\n1. **Real-time Monitoring**: Langfuse allows developers to monitor the performance of their NLP and LLM applications in real time. This includes tracking the inputs and outputs of the models, as well as any errors or issues that arise during operation.\\n\\n2. **Error Tracking**: It helps in identifying and tracking errors in the models' outputs. By analyzing incorrect or unexpected responses, developers can pinpoint where and why errors occur, facilitating more effective debugging and improvement.\\n\\n3. **Performance Metrics**: Langfuse provides various performance metrics, such as latency, throughput, and error rates. These metrics help developers understand how well their models are performing under different conditions and workloads.\\n\\n4. **Traceability**: It offers detailed traceability of requests and responses, allowing developers to follow the path of a request through the system and see how it is processed by the model at each step.\\n\\n5. **User Feedback Integration**: Langfuse can integrate user feedback to provide context for model outputs. This helps in understanding how real users are interacting with the model and how its outputs align with user expectations.\\n\\n6. **Customizable Dashboards**: Users can create custom dashboards to visualize the data collected by Langfuse. These dashboards can be tailored to highlight the most important metrics and insights for a specific application or team.\\n\\n7. **Alerting and Notifications**: It can set up alerts for specific conditions or errors, notifying developers when something goes wrong or when performance metrics fall outside of acceptable ranges.\\n\\nBy providing comprehensive observability for NLP and LLM applications, Langfuse helps developers to build more reliable, accurate, and user-friendly models and services.\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739554005, model='us.amazon.nova-micro-v1:0', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=380, prompt_tokens=5, total_tokens=385, completion_tokens_details=None, prompt_tokens_details=None))"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=LITELLM_VIRTUAL_KEY,\n",
" base_url=LITELLM_PROXY_BASE_URL\n",
")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"us.amazon.nova-micro-v1:0\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what is Langfuse?\"\n",
" }\n",
" ],\n",
")\n",
"\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Advanced - Set Langfuse Trace ID, Tags, Metadata \n",
"\n",
"Here is an example of how you can set Langfuse specific params on your client side request. See full list of supported langfuse params [here](https://docs.litellm.ai/docs/observability/langfuse_integration)\n",
"\n",
"You can view the logged trace of this request [here](https://us.cloud.langfuse.com/project/clvlhdfat0007vwb74m9lvfvi/traces/567890?timestamp=2025-02-14T17%3A30%3A26.709Z)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ChatCompletion(id='chatcmpl-789babd5-c064-4939-9093-46e4cd2e208a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Langfuse is an observability platform designed specifically for monitoring and improving the performance of natural language processing (NLP) models and applications. It provides developers with tools to track, analyze, and optimize how their language models interact with users and handle natural language inputs.\\n\\nHere are some key features and benefits of Langfuse:\\n\\n1. **Real-Time Monitoring**: Langfuse allows developers to monitor their NLP applications in real time. This includes tracking user interactions, model responses, and overall performance metrics.\\n\\n2. **Error Tracking**: It helps in identifying and tracking errors in the model's responses. This can include incorrect, irrelevant, or unsafe outputs.\\n\\n3. **User Feedback Integration**: Langfuse enables the collection of user feedback directly within the platform. This feedback can be used to identify areas for improvement in the model's performance.\\n\\n4. **Performance Metrics**: The platform provides detailed metrics and analytics on model performance, including latency, throughput, and accuracy.\\n\\n5. **Alerts and Notifications**: Developers can set up alerts to notify them of any significant issues or anomalies in model performance.\\n\\n6. **Debugging Tools**: Langfuse offers tools to help developers debug and refine their models by providing insights into how the model processes different types of inputs.\\n\\n7. **Integration with Development Workflows**: It integrates seamlessly with various development environments and CI/CD pipelines, making it easier to incorporate observability into the development process.\\n\\n8. **Customizable Dashboards**: Users can create custom dashboards to visualize the data in a way that best suits their needs.\\n\\nLangfuse aims to help developers build more reliable, accurate, and user-friendly NLP applications by providing them with the tools to observe and improve how their models perform in real-world scenarios.\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739554281, model='us.amazon.nova-micro-v1:0', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=346, prompt_tokens=5, total_tokens=351, completion_tokens_details=None, prompt_tokens_details=None))"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=LITELLM_VIRTUAL_KEY,\n",
" base_url=LITELLM_PROXY_BASE_URL\n",
")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"us.amazon.nova-micro-v1:0\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what is Langfuse?\"\n",
" }\n",
" ],\n",
" extra_body={\n",
" \"metadata\": {\n",
" \"generation_id\": \"1234567890\",\n",
" \"trace_id\": \"567890\",\n",
" \"trace_user_id\": \"user_1234567890\",\n",
" \"tags\": [\"tag1\", \"tag2\"]\n",
" }\n",
" }\n",
")\n",
"\n",
"response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## "
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.


View file

@ -1,5 +1,4 @@
import requests
import json
def get_initial_config():

View file

@ -36,7 +36,7 @@ def migrate_models(config_file, proxy_base_url):
litellm_model_name = litellm_params.get("model", "") or ""
if "vertex_ai/" in litellm_model_name:
print(f"\033[91m\nSkipping Vertex AI model\033[0m", model)
print("\033[91m\nSkipping Vertex AI model\033[0m", model)
continue
for param, value in litellm_params.items():

View file

@ -1,7 +1,6 @@
import os
from openai import OpenAI
from dotenv import load_dotenv
import httpx
import concurrent.futures
load_dotenv()

View file

@ -2,21 +2,16 @@
import json
import boto3
import sys, os
import traceback
import sys
import os
from dotenv import load_dotenv
load_dotenv()
import os, io
import io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
import io
import json
class TokenIterator:
@ -48,7 +43,6 @@ payload = {
"stream": True,
}
import boto3
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
response = client.invoke_endpoint_with_response_stream(

View file

@ -0,0 +1,54 @@
import json
# List of models to update
models_to_update = [
"gpt-4o-mini",
"gpt-4o-mini-2024-07-18",
"gpt-4o",
"gpt-4o-2024-11-20",
"gpt-4o-2024-08-06",
"gpt-4o-2024-05-13",
"text-embedding-3-small",
"text-embedding-3-large",
"text-embedding-ada-002-v2",
"ft:gpt-4o-2024-08-06",
"ft:gpt-4o-mini-2024-07-18",
"ft:gpt-3.5-turbo",
"ft:davinci-002",
"ft:babbage-002",
]
def update_model_prices(file_path):
# Read the JSON file as text first to preserve number formatting
with open(file_path, "r") as file:
original_text = file.read()
data = json.loads(original_text)
# Update specified models
for model_name in models_to_update:
print("finding model", model_name)
if model_name in data:
print("found model")
model = data[model_name]
if "input_cost_per_token" in model:
# Format new values to match original style
model["input_cost_per_token_batches"] = float(
"{:.12f}".format(model["input_cost_per_token"] / 2)
)
if "output_cost_per_token" in model:
model["output_cost_per_token_batches"] = float(
"{:.12f}".format(model["output_cost_per_token"] / 2)
)
print("new pricing for model=")
# Convert all float values to full decimal format before printing
formatted_model = {
k: "{:.9f}".format(v) if isinstance(v, float) else v
for k, v in data[model_name].items()
}
print(json.dumps(formatted_model, indent=4))
# Run the update
file_path = "model_prices_and_context_window.json"
update_model_prices(file_path)

View file

@ -111,7 +111,6 @@
},
"outputs": [],
"source": [
"import mlflow\n",
"mlflow.langchain.autolog()"
]
},

View file

@ -3,7 +3,6 @@ python script to pre-create all views required by LiteLLM Proxy Server
"""
import asyncio
import os
# Enter your DATABASE_URL here
@ -33,7 +32,7 @@ async def check_view_exists(): # noqa: PLR0915
# Try to select one row from the view
await db.query_raw("""SELECT 1 FROM "LiteLLM_VerificationTokenView" LIMIT 1""")
print("LiteLLM_VerificationTokenView Exists!") # noqa
except Exception as e:
except Exception:
# If an error occurs, the view does not exist, so create it
await db.execute_raw(
"""
@ -54,7 +53,7 @@ async def check_view_exists(): # noqa: PLR0915
try:
await db.query_raw("""SELECT 1 FROM "MonthlyGlobalSpend" LIMIT 1""")
print("MonthlyGlobalSpend Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE OR REPLACE VIEW "MonthlyGlobalSpend" AS
SELECT
@ -74,7 +73,7 @@ async def check_view_exists(): # noqa: PLR0915
try:
await db.query_raw("""SELECT 1 FROM "Last30dKeysBySpend" LIMIT 1""")
print("Last30dKeysBySpend Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE OR REPLACE VIEW "Last30dKeysBySpend" AS
SELECT
@ -102,7 +101,7 @@ async def check_view_exists(): # noqa: PLR0915
try:
await db.query_raw("""SELECT 1 FROM "Last30dModelsBySpend" LIMIT 1""")
print("Last30dModelsBySpend Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE OR REPLACE VIEW "Last30dModelsBySpend" AS
SELECT
@ -124,7 +123,7 @@ async def check_view_exists(): # noqa: PLR0915
try:
await db.query_raw("""SELECT 1 FROM "MonthlyGlobalSpendPerKey" LIMIT 1""")
print("MonthlyGlobalSpendPerKey Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE OR REPLACE VIEW "MonthlyGlobalSpendPerKey" AS
SELECT
@ -147,7 +146,7 @@ async def check_view_exists(): # noqa: PLR0915
"""SELECT 1 FROM "MonthlyGlobalSpendPerUserPerKey" LIMIT 1"""
)
print("MonthlyGlobalSpendPerUserPerKey Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE OR REPLACE VIEW "MonthlyGlobalSpendPerUserPerKey" AS
SELECT
@ -169,11 +168,11 @@ async def check_view_exists(): # noqa: PLR0915
print("MonthlyGlobalSpendPerUserPerKey Created!") # noqa
try:
await db.query_raw("""SELECT 1 FROM DailyTagSpend LIMIT 1""")
await db.query_raw("""SELECT 1 FROM "DailyTagSpend" LIMIT 1""")
print("DailyTagSpend Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE OR REPLACE VIEW DailyTagSpend AS
CREATE OR REPLACE VIEW "DailyTagSpend" AS
SELECT
jsonb_array_elements_text(request_tags) AS individual_request_tag,
DATE(s."startTime") AS spend_date,
@ -189,7 +188,7 @@ async def check_view_exists(): # noqa: PLR0915
try:
await db.query_raw("""SELECT 1 FROM "Last30dTopEndUsersSpend" LIMIT 1""")
print("Last30dTopEndUsersSpend Exists!") # noqa
except Exception as e:
except Exception:
sql_query = """
CREATE VIEW "Last30dTopEndUsersSpend" AS
SELECT end_user, COUNT(*) AS total_events, SUM(spend) AS total_spend

View file

@ -0,0 +1,15 @@
fullnameOverride: ""
# Disable database deployment and configuration
db:
deployStandalone: false
useExisting: false
# Test environment variables
envVars:
DD_ENV: "dev_helm"
DD_SERVICE: "litellm"
USE_DDTRACE: "true"
# Disable migration job since we're not using a database
migrationJob:
enabled: false

View file

@ -91,6 +91,12 @@ spec:
name: {{ include "redis.secretName" .Subcharts.redis }}
key: {{include "redis.secretPasswordKey" .Subcharts.redis }}
{{- end }}
{{- if .Values.envVars }}
{{- range $key, $val := .Values.envVars }}
- name: {{ $key }}
value: {{ $val | quote }}
{{- end }}
{{- end }}
envFrom:
{{- range .Values.environmentSecrets }}
- secretRef:

View file

@ -1,19 +1,27 @@
{{- if .Values.migrationJob.enabled }}
# This job runs the prisma migrations for the LiteLLM DB.
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "litellm.fullname" . }}-migrations
annotations:
argocd.argoproj.io/hook: PreSync
argocd.argoproj.io/hook-delete-policy: Never # keep this resource so we can debug status on ArgoCD
argocd.argoproj.io/hook-delete-policy: BeforeHookCreation # delete old migration on a new deploy in case the migration needs to make updates
checksum/config: {{ toYaml .Values | sha256sum }}
spec:
template:
metadata:
annotations:
{{- with .Values.migrationJob.annotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
containers:
- name: prisma-migrations
image: ghcr.io/berriai/litellm-database:main-latest
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
command: ["python", "litellm/proxy/prisma_migration.py"]
workingDir: "/app"
env:
@ -42,3 +50,4 @@ spec:
value: "false" # always run the migration from the Helm PreSync hook, override the value set
restartPolicy: OnFailure
backoffLimit: {{ .Values.migrationJob.backoffLimit }}
{{- end }}

View file

@ -10,6 +10,16 @@ spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "litellm.fullname" . }}:{{ .Values.service.port }}/health/readiness']
restartPolicy: Never
command: ['sh', '-c']
args:
- |
# Wait for a bit to allow the service to be ready
sleep 10
# Try multiple times with a delay between attempts
for i in $(seq 1 30); do
wget -T 5 "{{ include "litellm.fullname" . }}:{{ .Values.service.port }}/health/readiness" && exit 0
echo "Attempt $i failed, waiting..."
sleep 2
done
exit 1
restartPolicy: Never

View file

@ -0,0 +1,43 @@
apiVersion: v1
kind: Pod
metadata:
name: "{{ include "litellm.fullname" . }}-env-test"
labels:
{{- include "litellm.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: test
image: busybox
command: ['sh', '-c']
args:
- |
# Test DD_ENV
if [ "$DD_ENV" != "dev_helm" ]; then
echo "❌ Environment variable DD_ENV mismatch. Expected: dev_helm, Got: $DD_ENV"
exit 1
fi
echo "✅ Environment variable DD_ENV matches expected value: $DD_ENV"
# Test DD_SERVICE
if [ "$DD_SERVICE" != "litellm" ]; then
echo "❌ Environment variable DD_SERVICE mismatch. Expected: litellm, Got: $DD_SERVICE"
exit 1
fi
echo "✅ Environment variable DD_SERVICE matches expected value: $DD_SERVICE"
# Test USE_DDTRACE
if [ "$USE_DDTRACE" != "true" ]; then
echo "❌ Environment variable USE_DDTRACE mismatch. Expected: true, Got: $USE_DDTRACE"
exit 1
fi
echo "✅ Environment variable USE_DDTRACE matches expected value: $USE_DDTRACE"
env:
- name: DD_ENV
value: {{ .Values.envVars.DD_ENV | quote }}
- name: DD_SERVICE
value: {{ .Values.envVars.DD_SERVICE | quote }}
- name: USE_DDTRACE
value: {{ .Values.envVars.USE_DDTRACE | quote }}
restartPolicy: Never

View file

@ -186,5 +186,11 @@ migrationJob:
retries: 3 # Number of retries for the Job in case of failure
backoffLimit: 4 # Backoff limit for Job restarts
disableSchemaUpdate: false # Skip schema migrations for specific environments. When True, the job will exit with code 0.
annotations: {}
# Additional environment variables to be added to the deployment
envVars: {
# USE_DDTRACE: "true"
}

BIN
dist/litellm-1.57.6.tar.gz vendored Normal file

Binary file not shown.

View file

@ -10,14 +10,9 @@ services:
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./config.yaml:/app/config.yaml <<- this is missing in the docker-compose file currently
# The below two are my suggestion
# command:
# - "--config=/app/config.yaml"
##############################################
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
###############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
@ -34,6 +29,8 @@ services:
POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s

View file

@ -11,9 +11,7 @@ FROM $LITELLM_BUILD_IMAGE AS builder
WORKDIR /app
# Install build dependencies
RUN apk update && \
apk add --no-cache gcc python3-dev musl-dev && \
rm -rf /var/cache/apk/*
RUN apk add --no-cache gcc python3-dev musl-dev
RUN pip install --upgrade pip && \
pip install build
@ -48,8 +46,11 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
RUN chmod +x docker/entrypoint.sh
RUN chmod +x docker/prod_entrypoint.sh
EXPOSE 4000/tcp
# Set your entrypoint and command
ENTRYPOINT ["litellm"]
ENTRYPOINT ["docker/prod_entrypoint.sh"]
CMD ["--port", "4000"]

View file

@ -33,6 +33,7 @@ WORKDIR /app
# Make sure your docker/entrypoint.sh is executable
RUN chmod +x docker/entrypoint.sh
RUN chmod +x docker/prod_entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp

View file

@ -1,18 +1,20 @@
# Base image for building
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/python:latest-dev
# Runtime image
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/python:latest-dev
# Builder stage
FROM $LITELLM_BUILD_IMAGE AS builder
# Set the working directory to /app
WORKDIR /app
USER root
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN apk update && \
apk add --no-cache gcc python3-dev openssl openssl-dev
RUN pip install --upgrade pip && \
pip install build
@ -38,8 +40,12 @@ RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# Runtime stage
FROM $LITELLM_RUNTIME_IMAGE AS runtime
# Update dependencies and clean up - handles debian security issue
RUN apt-get update && apt-get upgrade -y && rm -rf /var/lib/apt/lists/*
# Ensure runtime stage runs as root
USER root
# Install runtime dependencies
RUN apk update && \
apk add --no-cache openssl
WORKDIR /app
# Copy the current directory contents into the container at /app
@ -67,12 +73,12 @@ RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh
# Generate prisma client
RUN prisma generate
RUN chmod +x docker/entrypoint.sh
RUN chmod +x docker/prod_entrypoint.sh
EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["litellm"]
ENTRYPOINT ["docker/prod_entrypoint.sh"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--detailed_debug"]

View file

@ -1,21 +1,24 @@
# Base image for building
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
ARG LITELLM_BUILD_IMAGE=python:3.13.1-slim
# Runtime image
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
ARG LITELLM_RUNTIME_IMAGE=python:3.13.1-slim
# Builder stage
FROM $LITELLM_BUILD_IMAGE AS builder
# Set the working directory to /app
WORKDIR /app
# Set the shell to bash
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Install build dependencies
RUN apt-get clean && apt-get update && \
apt-get install -y gcc python3-dev && \
rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip && \
pip install build
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir build
# Copy the current directory contents into the container at /app
COPY . .
@ -39,7 +42,7 @@ RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
FROM $LITELLM_RUNTIME_IMAGE AS runtime
# Update dependencies and clean up - handles debian security issue
RUN apt-get update && apt-get upgrade -y && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get upgrade -y && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy the current directory contents into the container at /app
@ -53,32 +56,42 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps
# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
# ensure pyjwt is used, not jwt
RUN pip uninstall jwt -y
RUN pip uninstall PyJWT -y
RUN pip install PyJWT==2.9.0 --no-cache-dir
RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \
pip uninstall jwt -y && \
pip uninstall PyJWT -y && \
pip install PyJWT==2.9.0 --no-cache-dir
# Build Admin UI
RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh
# Generate prisma client
ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
RUN mkdir -p /.cache
RUN chmod -R 777 /.cache
RUN pip install nodejs-bin
RUN pip install prisma
RUN prisma generate
### Prisma Handling for Non-Root #################################################
# Prisma allows you to specify the binary cache directory to use
ENV PRISMA_BINARY_CACHE_DIR=/nonexistent
RUN pip install --no-cache-dir nodejs-bin prisma
# Make a /non-existent folder and assign chown to nobody
RUN mkdir -p /nonexistent && \
chown -R nobody:nogroup /app && \
chown -R nobody:nogroup /nonexistent && \
chown -R nobody:nogroup /usr/local/lib/python3.13/site-packages/prisma/
RUN chmod +x docker/entrypoint.sh
RUN chmod +x docker/prod_entrypoint.sh
# Run Prisma generate as user = nobody
USER nobody
RUN prisma generate
### End of Prisma Handling for Non-Root #########################################
EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["docker/prod_entrypoint.sh"]
ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--detailed_debug"]
CMD ["--port", "4000"]

View file

@ -0,0 +1,23 @@
FROM cgr.dev/chainguard/python:latest-dev
USER root
WORKDIR /app
ENV HOME=/home/litellm
ENV PATH="${HOME}/venv/bin:$PATH"
# Install runtime dependencies
RUN apk update && \
apk add --no-cache gcc python3-dev openssl openssl-dev
RUN python -m venv ${HOME}/venv
RUN ${HOME}/venv/bin/pip install --no-cache-dir --upgrade pip
COPY requirements.txt .
RUN --mount=type=cache,target=${HOME}/.cache/pip \
${HOME}/venv/bin/pip install -r requirements.txt
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]

View file

@ -0,0 +1,9 @@
# Docker to build LiteLLM Proxy from the litellm pip package
### When to use this?
If you need to build LiteLLM Proxy from the litellm pip package, you can use this Dockerfile as a reference.
### Why build from the pip package?
- If your company has strict requirements around security / building images, you can follow the steps outlined here

View file

@ -0,0 +1,9 @@
model_list:
- model_name: "gpt-4"
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
alerting: ["slack"]

View file

@ -0,0 +1,5 @@
litellm[proxy] # Specify the litellm version you want to use
prometheus_client
langfuse
prisma
ddtrace==2.19.0 # for advanced DD tracing / profiling

View file

@ -0,0 +1,8 @@
#!/bin/sh
if [ "$USE_DDTRACE" = "true" ]; then
export DD_TRACE_OPENAI_ENABLED="False"
exec ddtrace-run litellm "$@"
else
exec litellm "$@"
fi

18
docker/tests/nonroot.yaml Normal file
View file

@ -0,0 +1,18 @@
schemaVersion: 2.0.0
metadataTest:
entrypoint: ["docker/prod_entrypoint.sh"]
user: "nobody"
workdir: "/app"
fileExistenceTests:
- name: "Prisma Folder"
path: "/usr/local/lib/python3.13/site-packages/prisma/"
shouldExist: true
uid: 65534
gid: 65534
- name: "Prisma Schema"
path: "/usr/local/lib/python3.13/site-packages/prisma/schema.prisma"
shouldExist: true
uid: 65534
gid: 65534

View file

@ -1,4 +1,4 @@
FROM python:3.10
FROM python:3.14.0a3-slim
COPY . /app
WORKDIR /app

View file

@ -1,43 +0,0 @@
# 🚅 litellm
A light 100 line package to simplify calling OpenAI, Azure, Cohere, Anthropic APIs
###### litellm manages:
* Calling all LLM APIs using the OpenAI format - `completion(model, messages)`
* Consistent output for all LLM APIs, text responses will always be available at `['choices'][0]['message']['content']`
* Consistent Exceptions for all LLM APIs, we map RateLimit, Context Window, and Authentication Error exceptions across all providers to their OpenAI equivalents. [see Code](https://github.com/BerriAI/litellm/blob/ba1079ff6698ef238c5c7f771dd2b698ec76f8d9/litellm/utils.py#L250)
###### observability:
* Logging - see exactly what the raw model request/response is by plugging in your own function `completion(.., logger_fn=your_logging_fn)` and/or print statements from the package `litellm.set_verbose=True`
* Callbacks - automatically send your data to Helicone, Sentry, Posthog, Slack - `litellm.success_callbacks`, `litellm.failure_callbacks` [see Callbacks](https://litellm.readthedocs.io/en/latest/advanced/)
## Quick Start
Go directly to code: [Getting Started Notebook](https://colab.research.google.com/drive/1gR3pY-JzDZahzpVdbGBtrNGDBmzUNJaJ?usp=sharing)
### Installation
```
pip install litellm
```
### Usage
```python
from litellm import completion
## set ENV variables
os.environ["OPENAI_API_KEY"] = "openai key"
os.environ["COHERE_API_KEY"] = "cohere key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion(model="gpt-3.5-turbo", messages=messages)
# cohere call
response = completion("command-nightly", messages)
```
Need Help / Support : [see troubleshooting](https://litellm.readthedocs.io/en/latest/troubleshoot)
## Why did we build liteLLM
- **Need for simplicity**: Our code started to get extremely complicated managing & translating calls between Azure, OpenAI, Cohere
## Support
* [Meet with us 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
* Contact us at ishaan@berri.ai / krrish@berri.ai

View file

@ -0,0 +1,24 @@
# Directory Structure
When adding a new provider, you need to create a directory for the provider that follows the following structure:
```
litellm/llms/
└── provider_name/
├── completion/ # use when endpoint is equivalent to openai's `/v1/completions`
│ ├── handler.py
│ └── transformation.py
├── chat/ # use when endpoint is equivalent to openai's `/v1/chat/completions`
│ ├── handler.py
│ └── transformation.py
├── embed/ # use when endpoint is equivalent to openai's `/v1/embeddings`
│ ├── handler.py
│ └── transformation.py
├── audio_transcription/ # use when endpoint is equivalent to openai's `/v1/audio/transcriptions`
│ ├── handler.py
│ └── transformation.py
└── rerank/ # use when endpoint is equivalent to cohere's `/rerank` endpoint.
├── handler.py
└── transformation.py
```

View file

@ -0,0 +1,84 @@
# Add Rerank Provider
LiteLLM **follows the Cohere Rerank API format** for all rerank providers. Here's how to add a new rerank provider:
## 1. Create a transformation.py file
Create a config class named `<Provider><Endpoint>Config` that inherits from [`BaseRerankConfig`](https://github.com/BerriAI/litellm/blob/main/litellm/llms/base_llm/rerank/transformation.py):
```python
from litellm.types.rerank import OptionalRerankParams, RerankRequest, RerankResponse
class YourProviderRerankConfig(BaseRerankConfig):
def get_supported_cohere_rerank_params(self, model: str) -> list:
return [
"query",
"documents",
"top_n",
# ... other supported params
]
def transform_rerank_request(self, model: str, optional_rerank_params: OptionalRerankParams, headers: dict) -> dict:
# Transform request to RerankRequest spec
return rerank_request.model_dump(exclude_none=True)
def transform_rerank_response(self, model: str, raw_response: httpx.Response, ...) -> RerankResponse:
# Transform provider response to RerankResponse
return RerankResponse(**raw_response_json)
```
## 2. Register Your Provider
Add your provider to `litellm.utils.get_provider_rerank_config()`:
```python
elif litellm.LlmProviders.YOUR_PROVIDER == provider:
return litellm.YourProviderRerankConfig()
```
## 3. Add Provider to `rerank_api/main.py`
Add a code block to handle when your provider is called. Your provider should use the `base_llm_http_handler.rerank` method
```python
elif _custom_llm_provider == "your_provider":
...
response = base_llm_http_handler.rerank(
model=model,
custom_llm_provider=_custom_llm_provider,
optional_rerank_params=optional_rerank_params,
logging_obj=litellm_logging_obj,
timeout=optional_params.timeout,
api_key=dynamic_api_key or optional_params.api_key,
api_base=api_base,
_is_async=_is_async,
headers=headers or litellm.headers or {},
client=client,
model_response=model_response,
)
...
```
## 4. Add Tests
Add a test file to [`tests/llm_translation`](https://github.com/BerriAI/litellm/tree/main/tests/llm_translation)
```python
def test_basic_rerank_cohere():
response = litellm.rerank(
model="cohere/rerank-english-v3.0",
query="hello",
documents=["hello", "world"],
top_n=3,
)
print("re rank response: ", response)
assert response.id is not None
assert response.results is not None
```
## Reference PRs
- [Add Infinity Rerank](https://github.com/BerriAI/litellm/pull/7321)

View file

@ -105,4 +105,12 @@ transcript = client.audio.transcriptions.create(
)
```
</TabItem>
</Tabs>
</Tabs>
## Supported Providers
- OpenAI
- Azure
- [Fireworks AI](./providers/fireworks_ai.md#audio-transcription)
- [Groq](./providers/groq.md#speech-to-text---whisper)
- [Deepgram](./providers/deepgram.md)

View file

@ -5,6 +5,12 @@ import TabItem from '@theme/TabItem';
Covers Batches, Files
| Feature | Supported | Notes |
|-------|-------|-------|
| Supported Providers | OpenAI, Azure, Vertex | - |
| ✨ Cost Tracking | ✅ | LiteLLM Enterprise only |
| Logging | ✅ | Works across all logging integrations |
## Quick Start
- Create File for Batch Completion
@ -144,4 +150,23 @@ print("list_batches_response=", list_batches_response)
### [Vertex AI](./providers/vertex#batch-apis)
## How Cost Tracking for Batches API Works
LiteLLM tracks batch processing costs by logging two key events:
| Event Type | Description | When it's Logged |
|------------|-------------|------------------|
| `acreate_batch` | Initial batch creation | When batch request is submitted |
| `batch_success` | Final usage and cost | When batch processing completes |
Cost calculation:
- LiteLLM polls the batch status until completion
- Upon completion, it aggregates usage and costs from all responses in the output file
- Total `token` and `response_cost` reflect the combined metrics across all batch responses
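To make this concrete, here is a minimal client-side sketch of the same flow against a LiteLLM Proxy, using the OpenAI SDK's Batches API. The proxy URL, virtual key, and input file name are placeholders, not values from this repo:
```python
import time

import openai

# Placeholder proxy URL and virtual key
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# Upload the .jsonl batch input file
batch_file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")

# Submitting the batch is when LiteLLM logs the `acreate_batch` event
batch = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# LiteLLM polls the batch status server-side; this loop just mirrors that from the client
while batch.status not in ("completed", "failed", "expired", "cancelled"):
    time.sleep(30)
    batch = client.batches.retrieve(batch.id)

# On completion, LiteLLM logs `batch_success` with the aggregated tokens and response_cost
print(batch.status, batch.request_counts)
```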
## [Swagger API Reference](https://litellm-api.up.railway.app/#/batch)

View file

@ -1,21 +1,61 @@
import Image from '@theme/IdealImage';
# Benchmarks
Benchmarks for LiteLLM Gateway (Proxy Server)
Benchmarks for LiteLLM Gateway (Proxy Server) tested against a fake OpenAI endpoint.
Locust Settings:
- 2500 Users
- 100 user Ramp Up
Use this config for testing:
**Note:** we're currently migrating to aiohttp which has 10x higher throughput. We recommend using the `aiohttp_openai/` provider for load testing.
```yaml
model_list:
- model_name: "fake-openai-endpoint"
litellm_params:
model: aiohttp_openai/any
api_base: https://your-fake-openai-endpoint.com/chat/completions
api_key: "test"
```
### 1 Instance LiteLLM Proxy
In these tests the median latency of directly calling the fake-openai-endpoint is 60ms.
| Metric | Litellm Proxy (1 Instance) |
|--------|------------------------|
| RPS | 475 |
| Median Latency (ms) | 100 |
| Latency overhead added by LiteLLM Proxy | 40ms |
<!-- <Image img={require('../img/1_instance_proxy.png')} /> -->
<!-- ## **Horizontal Scaling - 10K RPS**
<Image img={require('../img/instances_vs_rps.png')} /> -->
#### Key Findings
- Single instance: 475 RPS @ 100ms latency
- 2 LiteLLM instances: 950 RPS @ 100ms latency
- 4 LiteLLM instances: 1900 RPS @ 100ms latency
### 2 Instances
**Adding 1 instance will double the RPS and maintain the `100ms-110ms` median latency.**
| Metric | Litellm Proxy (2 Instances) |
|--------|------------------------|
| Median Latency (ms) | 100 |
| RPS | 950 |
## Basic Benchmarks
## Machine Spec used for testing
Overhead when using a Deployed Proxy vs Direct to LLM
- Latency overhead added by LiteLLM Proxy: 107ms
Each machine deploying LiteLLM had the following specs:
- 2 CPU
- 4GB RAM
| Metric | Direct to Fake Endpoint | Basic Litellm Proxy |
|--------|------------------------|---------------------|
| RPS | 1196 | 1133.2 |
| Median Latency (ms) | 33 | 140 |
## Logging Callbacks
@ -39,3 +79,9 @@ Using LangSmith has **no impact on latency, RPS compared to Basic Litellm Proxy*
| RPS | 1133.2 | 1135 |
| Median Latency (ms) | 140 | 132 |
## Locust Settings
- 2500 Users
- 100 user Ramp Up
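For reference, a minimal locustfile sketch matching these settings (the proxy URL, virtual key, and model name below are placeholders):
```python
# locustfile.py
from locust import HttpUser, task, between


class ProxyUser(HttpUser):
    # Each simulated user waits 0.5-1s between requests
    wait_time = between(0.5, 1)

    @task
    def chat_completion(self):
        # POST to the proxy's OpenAI-compatible chat completions route
        self.client.post(
            "/chat/completions",
            json={
                "model": "fake-openai-endpoint",
                "messages": [{"role": "user", "content": "hello"}],
            },
            headers={"Authorization": "Bearer sk-1234"},
        )
```
Run it with `locust -f locustfile.py --host http://0.0.0.0:4000 -u 2500 -r 100` to approximate the 2500 users / 100 ramp-up configuration above.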

View file

@ -8,6 +8,7 @@ Use `litellm.supports_function_calling(model="")` -> returns `True` if model sup
assert litellm.supports_function_calling(model="gpt-3.5-turbo") == True
assert litellm.supports_function_calling(model="azure/gpt-4-1106-preview") == True
assert litellm.supports_function_calling(model="palm/chat-bison") == False
assert litellm.supports_function_calling(model="xai/grok-2-latest") == True
assert litellm.supports_function_calling(model="ollama/llama2") == False
```

View file

@ -44,6 +44,7 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | | ✅ | ✅ | | | ✅ |
|OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
|Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ |
|xAI| ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
|Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
@ -191,6 +192,10 @@ def completion(
- `top_logprobs`: *int (optional)* - An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to true if this parameter is used.
- `headers`: *dict (optional)* - A dictionary of headers to be sent with the request.
- `extra_headers`: *dict (optional)* - Alternative to `headers`, used to send extra headers in LLM API request.
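As a quick illustration of the two header params above, here is a hedged sketch (the header name and model are placeholders):
```python
import os

from litellm import completion

os.environ["OPENAI_API_KEY"] = "sk-..."  # assumed to already be set

response = completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hello"}],
    # extra_headers are forwarded with the underlying LLM API request
    extra_headers={"X-Request-Source": "docs-example"},
)
print(response.choices[0].message.content)
```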
#### Deprecated Params
- `functions`: *array* - A list of functions that the model may use to generate JSON inputs. Each function should have the following properties:

View file

@ -89,6 +89,7 @@ response_format: { "type": "json_schema", "json_schema": … , "strict": true }
Works for:
- OpenAI models
- Azure OpenAI models
- xAI models (Grok-2 or later)
- Google AI Studio - Gemini models
- Vertex AI models (Gemini + Anthropic)
- Bedrock Models
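As a rough sketch of the call shape (the model name and schema are illustrative; assumes the relevant API key is set):
```python
from litellm import completion

response = completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Name a city and its country."}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "city_info",
            "schema": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    "country": {"type": "string"},
                },
                "required": ["city", "country"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    },
)
print(response.choices[0].message.content)  # JSON string matching the schema
```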

View file

@ -3,9 +3,11 @@ import TabItem from '@theme/TabItem';
# Streaming + Async
- [Streaming Responses](#streaming-responses)
- [Async Completion](#async-completion)
- [Async + Streaming Completion](#async-streaming)
| Feature | LiteLLM SDK | LiteLLM Proxy |
|---------|-------------|---------------|
| Streaming | ✅ [start here](#streaming-responses) | ✅ [start here](../proxy/user_keys#streaming) |
| Async | ✅ [start here](#async-completion) | ✅ [start here](../proxy/user_keys#streaming) |
| Async Streaming | ✅ [start here](#async-streaming) | ✅ [start here](../proxy/user_keys#streaming) |
## Streaming Responses
LiteLLM supports streaming the model response back by passing `stream=True` as an argument to the completion function
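A minimal sketch of the pattern (assumes `OPENAI_API_KEY` is set; the model name is illustrative):
```python
from litellm import completion

response = completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "write a one-line haiku"}],
    stream=True,
)

# Each chunk follows the OpenAI streaming format; delta.content may be None
for chunk in response:
    print(chunk.choices[0].delta.content or "", end="")
```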

View file

@ -118,9 +118,11 @@ response = client.chat.completions.create(
Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vision` and `False` if not
```python
assert litellm.supports_vision(model="gpt-4-vision-preview") == True
assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="gpt-3.5-turbo") == False
assert litellm.supports_vision(model="openai/gpt-4-vision-preview") == True
assert litellm.supports_vision(model="vertex_ai/gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="openai/gpt-3.5-turbo") == False
assert litellm.supports_vision(model="xai/grok-2-vision-latest") == True
assert litellm.supports_vision(model="xai/grok-2-latest") == False
```
</TabItem>

View file

@ -0,0 +1,47 @@
# Data Retention Policy
## LiteLLM Cloud
### Purpose
This policy outlines the requirements and controls/procedures LiteLLM Cloud has implemented to manage the retention and deletion of customer data.
### Policy
For Customers
1. Active Accounts
- Customer data is retained for as long as the customer's account is in active status. This includes data such as prompts, generated content, logs, and usage metrics.
2. Voluntary Account Closure
- Data enters an “expired” state when the account is voluntarily closed.
- Expired account data will be retained for 30 days (adjust as needed).
- After this period, the account and all related data will be permanently removed from LiteLLM Cloud systems.
- Customers who wish to voluntarily close their account should download or back up their data (manually or via available APIs) before initiating the closure process.
3. Involuntary Suspension
- If a customer account is involuntarily suspended (e.g., due to non-payment or violation of Terms of Service), there is a 14-day (adjust as needed) grace period during which the account will be inaccessible but can be reopened if the customer resolves the issues leading to suspension.
- After the grace period, if the account remains unresolved, it will be closed and the data will enter the “expired” state.
- Once data is in the “expired” state, it will be permanently removed 30 days (adjust as needed) thereafter, unless legal requirements dictate otherwise.
4. Manual Backup of Suspended Accounts
- If a customer wishes to manually back up data contained in a suspended account, they must bring the account back to good standing (by resolving payment or policy violations) to regain interface/API access.
- Data from a suspended account will not be accessible while the account is in suspension status.
- After 14 days of suspension (adjust as needed), if no resolution is reached, the account is closed and data follows the standard “expired” data removal timeline stated above.
5. Custom Retention Policies
- Enterprise customers can configure custom data retention periods based on their specific compliance and business requirements.
- Available customization options include:
- Adjusting the retention period for active data (0-365 days)
- Custom retention policies must be configured through the LiteLLM Cloud dashboard or via API
### Protection of Records
- LiteLLM Cloud takes measures to ensure that all records under its control are protected against loss, destruction, falsification, and unauthorized access or disclosure. These measures are aligned with relevant legislative, regulatory, contractual, and business obligations.
- When working with a third-party CSP, LiteLLM Cloud requests comprehensive information regarding the CSPs security mechanisms to protect data, including records stored or processed on behalf of LiteLLM Cloud.
- Cloud service providers engaged by LiteLLM Cloud must disclose their safeguarding practices for records they gather and store on LiteLLM Clouds behalf.

View file

@ -1,5 +1,25 @@
# Data Privacy and Security
At LiteLLM, **safeguarding your data privacy and security** is our top priority. We recognize the critical importance of the data you share with us and handle it with the highest level of diligence.
With LiteLLM Cloud, we handle:
- Deployment
- Scaling
- Upgrades and security patches
- Ensuring high availability
<iframe
src="https://status.litellm.ai/badge?theme=light"
width="250"
height="30"
className="inline-block dark:hidden"
style={{
colorScheme: "light",
marginTop: "5px",
}}
></iframe>
## Security Measures
### LiteLLM Cloud
@ -12,17 +32,24 @@
- Audit Logs with retention policy
- Control Allowed IP Addresses that can access your Cloud LiteLLM Instance
For security inquiries, please contact us at support@berri.ai
### Self-hosted Instances LiteLLM
- ** No data or telemetry is stored on LiteLLM Servers when you self host **
- For installation and configuration, see: [Self-hosting guided](../docs/proxy/deploy.md)
- **Telemetry** We run no telemetry when you self host LiteLLM
- **No data or telemetry is stored on LiteLLM Servers when you self-host**
- For installation and configuration, see: [Self-hosting guide](../docs/proxy/deploy.md)
- **Telemetry**: We run no telemetry when you self-host LiteLLM
For security inquiries, please contact us at support@berri.ai
## Supported data regions for LiteLLM Cloud
## **Security Certifications**
| **Certification** | **Status** |
|-------------------|-------------------------------------------------------------------------------------------------|
| SOC 2 Type I | Certified. Report available upon request on Enterprise plan. |
| SOC 2 Type II | In progress. Certificate available by April 15th, 2025 |
| ISO27001 | In progress. Certificate available by February 7th, 2025 |
## Supported Data Regions for LiteLLM Cloud
LiteLLM supports the following data regions:
@ -31,7 +58,7 @@ LiteLLM supports the following data regions:
All data, user accounts, and infrastructure are completely separated between these two regions
## Collection of personal data
## Collection of Personal Data
### For Self-hosted LiteLLM Users:
- No personal data is collected or transmitted to LiteLLM servers when you self-host our software.
@ -40,12 +67,13 @@ All data, user accounts, and infrastructure are completely separated between the
### For LiteLLM Cloud Users:
- LiteLLM Cloud tracks LLM usage data - We do not access or store the message / response content of your API requests or responses. You can see the [fields tracked here](https://github.com/BerriAI/litellm/blob/main/schema.prisma#L174)
**How to use and share the personal data**
**How to Use and Share the Personal Data**
- Only proxy admins can view their usage data, and they can only see the usage data of their organization.
- Proxy admins have the ability to invite other users / admins to their server to view their own usage data
- LiteLLM Cloud does not sell or share any usage data with any third parties.
## Cookies information, security and privacy
## Cookies Information, Security, and Privacy
### For Self-hosted LiteLLM Users:
- Cookie data remains within your own infrastructure.
@ -81,6 +109,12 @@ We value the security community's role in protecting our systems and users. To r
We'll review all reports promptly. Note that we don't currently offer a bug bounty program.
## Vulnerability Scanning
- LiteLLM runs [`grype`](https://github.com/anchore/grype) security scans on all built Docker images.
- See [`grype litellm` check on ci/cd](https://github.com/BerriAI/litellm/blob/main/.circleci/config.yml#L1099).
- Current Status: ✅ Passing. 0 High/Critical severity vulnerabilities found.
## Legal/Compliance FAQs
### Procurement Options
@ -89,35 +123,37 @@ We'll review all reports promptly. Note that we don't currently offer a bug boun
2. AWS Marketplace
3. Azure Marketplace
### Vendor Information
Legal Entity Name: Berrie AI Incorporated
Company Phone Number: 7708783106
Number of employees in the company: 2
Number of employees in security team: 2
Point of contact email address for security incidents: krrish@berri.ai
Point of contact email address for general security-related questions: krrish@berri.ai
Has the Vendor been audited / certified? Currently undergoing SOC-2 Certification from Drata
Has the Vendor been audited / certified?
- SOC 2 Type I. Certified. Report available upon request on Enterprise plan.
- SOC 2 Type II. In progress. Certificate available by April 15th, 2025.
- ISO27001. In progress. Certificate available by February 7th, 2025.
Has an information security management system been implemented? Yes - [CodeQL](https://codeql.github.com/)
Has an information security management system been implemented?
- Yes - [CodeQL](https://codeql.github.com/) and a comprehensive ISMS covering multiple security domains.
Is logging of key events - auth, creation, update changes occurring? Yes - we have [audit logs](https://docs.litellm.ai/docs/proxy/multiple_admins#1-switch-on-audit-logs)
Is logging of key events - auth, creation, update changes occurring?
- Yes - we have [audit logs](https://docs.litellm.ai/docs/proxy/multiple_admins#1-switch-on-audit-logs)
Does the Vendor have an established Cybersecurity incident management program? No
Does the Vendor have an established Cybersecurity incident management program?
- Yes, Incident Response Policy available upon request.
Not applicable - LiteLLM is self-hosted, this is the responsibility of the team hosting the proxy. We do provide [alerting](https://docs.litellm.ai/docs/proxy/alerting) and [monitoring](https://docs.litellm.ai/docs/proxy/prometheus) tools to help with this.
Does the vendor have a vulnerability disclosure policy in place? [Yes](https://github.com/BerriAI/litellm?tab=security-ov-file#security-vulnerability-reporting-guidelines)
Does the vendor perform vulnerability scans? No
Does the vendor perform vulnerability scans?
- Yes, regular vulnerability scans are conducted as detailed in the [Vulnerability Scanning](#vulnerability-scanning) section.
Signer Name: Krish Amit Dholakia
Signer Email: krrish@berri.ai
Signer Email: krrish@berri.ai

View file

@ -1,5 +1,5 @@
# Local Debugging
There's 2 ways to do local debugging - `litellm.set_verbose=True` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `set_verbose` in production. It logs API keys, which might end up in log files.
There's 2 ways to do local debugging - `litellm._turn_on_debug()` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `_turn_on_debug()` in production. It logs API keys, which might end up in log files.
## Set Verbose
@ -8,7 +8,7 @@ This is good for getting print statements for everything litellm is doing.
import litellm
from litellm import completion
litellm.set_verbose=True # 👈 this is the 1-line change you need to make
litellm._turn_on_debug() # 👈 this is the 1-line change you need to make
## set ENV variables
os.environ["OPENAI_API_KEY"] = "openai key"

View file

@ -323,6 +323,40 @@ response = embedding(
| embed-english-light-v2.0 | `embedding(model="embed-english-light-v2.0", input=["good morning from litellm", "this is another item"])` |
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
## NVIDIA NIM Embedding Models
### API keys
This can be set as env variables or passed as **params to litellm.embedding()**
```python
import os
os.environ["NVIDIA_NIM_API_KEY"] = "" # api key
os.environ["NVIDIA_NIM_API_BASE"] = "" # nim endpoint url
```
### Usage
```python
from litellm import embedding
import os
os.environ['NVIDIA_NIM_API_KEY'] = ""
response = embedding(
model='nvidia_nim/<model_name>',
input=["good morning from litellm"]
)
```
All models listed [here](https://build.nvidia.com/explore/retrieval) are supported:
| Model Name | Function Call |
| :--- | :--- |
| NV-Embed-QA | `embedding(model="nvidia_nim/NV-Embed-QA", input)` |
| nvidia/nv-embed-v1 | `embedding(model="nvidia_nim/nvidia/nv-embed-v1", input)` |
| nvidia/nv-embedqa-mistral-7b-v2 | `embedding(model="nvidia_nim/nvidia/nv-embedqa-mistral-7b-v2", input)` |
| nvidia/nv-embedqa-e5-v5 | `embedding(model="nvidia_nim/nvidia/nv-embedqa-e5-v5", input)` |
| nvidia/embed-qa-4 | `embedding(model="nvidia_nim/nvidia/embed-qa-4", input)` |
| nvidia/llama-3.2-nv-embedqa-1b-v1 | `embedding(model="nvidia_nim/nvidia/llama-3.2-nv-embedqa-1b-v1", input)` |
| nvidia/llama-3.2-nv-embedqa-1b-v2 | `embedding(model="nvidia_nim/nvidia/llama-3.2-nv-embedqa-1b-v2", input)` |
| snowflake/arctic-embed-l | `embedding(model="nvidia_nim/snowflake/arctic-embed-l", input)` |
| baai/bge-m3 | `embedding(model="nvidia_nim/baai/bge-m3", input)` |
## HuggingFace Embedding Models
LiteLLM supports all Feature-Extraction + Sentence Similarity Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
@ -394,6 +428,32 @@ print(response)
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mistral-embed | `embedding(model="mistral/mistral-embed", input)` |
## Gemini AI Embedding Models
### API keys
This can be set as env variables or passed as **params to litellm.embedding()**
```python
import os
os.environ["GEMINI_API_KEY"] = ""
```
### Usage - Embedding
```python
from litellm import embedding
response = embedding(
model="gemini/text-embedding-004",
input=["good morning from litellm"],
)
print(response)
```
All models listed [here](https://ai.google.dev/gemini-api/docs/models/gemini) are supported:
| Model Name | Function Call |
| :--- | :--- |
| text-embedding-004 | `embedding(model="gemini/text-embedding-004", input)` |
## Vertex AI Embedding Models
@ -411,7 +471,7 @@ response = embedding(
print(response)
```
## Supported Models
### Supported Models
All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported
| Model Name | Function Call |
@ -509,4 +569,4 @@ curl -X POST 'http://0.0.0.0:4000/v1/embeddings' \
}'
```
</TabItem>
</Tabs>
</Tabs>

View file

@ -2,63 +2,42 @@
For companies that need SSO, user management and professional support for LiteLLM Proxy
:::info
Interested in Enterprise? Schedule a meeting with us here 👉
[Talk to founders](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
Get free 7-day trial key [here](https://www.litellm.ai/#trial)
:::
Deploy managed LiteLLM Proxy within your VPC.
Includes all enterprise features.
[**View AWS Marketplace Listing**](https://aws.amazon.com/marketplace/pp/prodview-gdm3gswgjhgjo?sr=0-1&ref_=beagle&applicationId=AWSMPContessa)
[**Get early access**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
[**Procurement available via AWS / Azure Marketplace**](./data_security.md#legalcompliance-faqs)
This covers:
- **Enterprise Features**
- **Security**
- ✅ [SSO for Admin UI](./proxy/ui#✨-enterprise-features)
- ✅ [Audit Logs with retention policy](./proxy/enterprise#audit-logs)
- ✅ [JWT-Auth](../docs/proxy/token_auth.md)
- ✅ [Control available public, private routes (Restrict certain endpoints on proxy)](./proxy/enterprise#control-available-public-private-routes)
- ✅ [**Secret Managers** AWS Key Manager, Google Secret Manager, Azure Key](./secret)
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./proxy/team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./proxy/team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Controlling Guardrails by Virtual Keys**
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Prometheus Metrics**
- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](./proxy/prometheus)
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Custom Branding**
- ✅ [Custom Branding + Routes on Swagger Docs](./proxy/enterprise#swagger-docs---custom-routes--branding)
- ✅ [Public Model Hub](../docs/proxy/enterprise.md#public-model-hub)
- ✅ [Custom Email Branding](../docs/proxy/email.md#customizing-email-branding)
- [**Enterprise Features**](./proxy/enterprise)
- ✅ **Feature Prioritization**
- ✅ **Custom Integrations**
- ✅ **Professional Support - Dedicated discord + slack**
Deployment Options:
**Self-Hosted**
1. Manage Yourself - you can deploy our Docker Image or build a custom image from our pip package, and manage your own infrastructure. In this case, we would give you a license key + provide support via a dedicated support channel.
2. We Manage - you give us subscription access on your AWS/Azure/GCP account, and we manage the deployment.
**Managed**
You can use our cloud product where we setup a dedicated instance for you.
## Frequently Asked Questions
### What topics does Professional support cover and what SLAs do you offer?
### SLAs + Professional Support
Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can't solve your own infrastructure-related issues, but we will guide you to fix them.
- 1 hour for Sev0 issues
- 6 hours for Sev1
- 24h for Sev2-Sev3 between 7am and 7pm PT (Monday through Saturday)
- 72h SLA for patching vulnerabilities in the software.
**We can offer custom SLAs** based on your needs and the severity of the issue.
@ -75,4 +54,8 @@ You just deploy [our docker image](https://docs.litellm.ai/docs/proxy/deploy) an
LITELLM_LICENSE="eyJ..."
```
No data leaves your environment.
No data leaves your environment.
## Data Security / Legal / Compliance FAQs
[Data Security / Legal / Compliance FAQs](./data_security.md)

View file

@ -0,0 +1,127 @@
import TabItem from '@theme/TabItem';
import Tabs from '@theme/Tabs';
# Files API
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
## Quick Start
- Upload a File
- List Files
- Retrieve File Information
- Delete File
- Get File Content
<Tabs>
<TabItem value="proxy" label="LiteLLM PROXY Server">
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Upload a File**
```bash
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="fine-tune" \
-F file="@mydata.jsonl"
```
**List Files**
```bash
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234"
```
**Retrieve File Information**
```bash
curl http://localhost:4000/v1/files/file-abc123 \
-H "Authorization: Bearer sk-1234"
```
**Delete File**
```bash
curl http://localhost:4000/v1/files/file-abc123 \
-X DELETE \
-H "Authorization: Bearer sk-1234"
```
**Get File Content**
```bash
curl http://localhost:4000/v1/files/file-abc123/content \
-H "Authorization: Bearer sk-1234"
```
</TabItem>
<TabItem value="sdk" label="SDK">
**Upload a File**
```python
import litellm
import os
os.environ["OPENAI_API_KEY"] = "sk-.."
file_obj = await litellm.acreate_file(
file=open("mydata.jsonl", "rb"),
purpose="fine-tune",
custom_llm_provider="openai",
)
print("Response from creating file=", file_obj)
```
**List Files**
```python
files = await litellm.alist_files(
custom_llm_provider="openai",
limit=10
)
print("files=", files)
```
**Retrieve File Information**
```python
file = await litellm.aretrieve_file(
file_id="file-abc123",
custom_llm_provider="openai"
)
print("file=", file)
```
**Delete File**
```python
response = await litellm.adelete_file(
file_id="file-abc123",
custom_llm_provider="openai"
)
print("delete response=", response)
```
**Get File Content**
```python
content = await litellm.afile_content(
file_id="file-abc123",
custom_llm_provider="openai"
)
print("file content=", content)
```
</TabItem>
</Tabs>
## **Supported Providers**:
### [OpenAI](#quick-start)
### [Azure OpenAI](./providers/azure#azure-batches-api)
### [Vertex AI](./providers/vertex#batch-apis)
## [Swagger API Reference](https://litellm-api.up.railway.app/#/files)

View file

@ -10,10 +10,12 @@ This is an Enterprise only endpoint [Get Started with Enterprise here](https://c
:::
## Supported Providers
- Azure OpenAI
- OpenAI
- Vertex AI
| Feature | Supported | Notes |
|-------|-------|-------|
| Supported Providers | OpenAI, Azure OpenAI, Vertex AI | - |
| Cost Tracking | 🟡 | [Let us know if you need this](https://github.com/BerriAI/litellm/issues) |
| Logging | ✅ | Works across all logging integrations |
Add `finetune_settings` and `files_settings` to your litellm config.yaml to use the fine-tuning endpoints.
## Example config.yaml for `finetune_settings` and `files_settings`
@ -110,58 +112,6 @@ curl http://localhost:4000/v1/fine_tuning/jobs \
</TabItem>
<TabItem value="Vertex" label="VertexAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gemini-1.0-pro-002", # Vertex model you want to fine-tune
training_file="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl", # file_id from create file response
extra_body={"custom_llm_provider": "vertex_ai"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl (Unified API)">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "vertex_ai",
"model": "gemini-1.0-pro-002",
"training_file": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}'
```
</TabItem>
<TabItem value="curl-vtx" label="curl (VertexAI API)">
:::info
Use this to create Fine tuning Jobs in [the Vertex AI API Format](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning#create-tuning)
:::
```shell
curl http://localhost:4000/v1/projects/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Request Body

View file

@ -80,13 +80,13 @@ except OpenAIError as e:
## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, Helicone, Promptlayer, Traceloop, Slack
LiteLLM exposes pre-defined callbacks to send data to MLflow, Lunary, Langfuse, Helicone, Promptlayer, Traceloop, Slack
```python
from litellm import completion
## set env variables for logging tools
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
## set env variables for logging tools (API key set up is not required when using MLflow)
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" # get your public key at https://app.lunary.ai/settings
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
@ -94,7 +94,7 @@ os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["OPENAI_API_KEY"]
# set callbacks
litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to langfuse, lunary, supabase, helicone
litellm.success_callback = ["lunary", "mlflow", "langfuse", "helicone"] # log input/output to MLflow, langfuse, lunary, helicone
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])

View file

@ -0,0 +1,31 @@
# [BETA] Image Variations
OpenAI's `/image/variations` endpoint is now supported.
## Quick Start
```python
from litellm import image_variation
import os
# set env vars
os.environ["OPENAI_API_KEY"] = ""
os.environ["TOPAZ_API_KEY"] = ""
# openai call
response = image_variation(
model="dall-e-2", image=image_url
)
# topaz call
response = image_variation(
model="topaz/Standard V2", image=image_url
)
print(response)
```
## Supported Providers
- OpenAI
- Topaz

View file

@ -67,7 +67,7 @@ import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="gpt-3.5-turbo",
model="openai/gpt-4o",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
@ -83,13 +83,27 @@ import os
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
response = completion(
model="claude-2",
model="anthropic/claude-3-sonnet-20240229",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="xai" label="xAI">
```python
from litellm import completion
import os
## set ENV variables
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="vertex" label="VertexAI">
```python
@ -101,7 +115,25 @@ os.environ["VERTEX_PROJECT"] = "hardy-device-386718"
os.environ["VERTEX_LOCATION"] = "us-central1"
response = completion(
model="chat-bison",
model="vertex_ai/gemini-1.5-pro",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="nvidia" label="NVIDIA">
```python
from litellm import completion
import os
## set ENV variables
os.environ["NVIDIA_NIM_API_KEY"] = "nvidia_api_key"
os.environ["NVIDIA_NIM_API_BASE"] = "nvidia_nim_endpoint_url"
response = completion(
model="nvidia_nim/<model_name>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
@ -180,6 +212,42 @@ response = completion(
</Tabs>
### Response Format (OpenAI Format)
```json
{
"id": "chatcmpl-565d891b-a42e-4c39-8d14-82a1f5208885",
"created": 1734366691,
"model": "claude-3-sonnet-20240229",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hello! As an AI language model, I don't have feelings, but I'm operating properly and ready to assist you with any questions or tasks you may have. How can I help you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 43,
"prompt_tokens": 13,
"total_tokens": 56,
"completion_tokens_details": null,
"prompt_tokens_details": {
"audio_tokens": null,
"cached_tokens": 0
},
"cache_creation_input_tokens": 0,
"cache_read_input_tokens": 0
}
}
```
### Streaming
Set `stream=True` in the `completion` args.
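The call then returns an iterator of OpenAI-style chunks; a minimal sketch of consuming it (assuming `OPENAI_API_KEY` is set):

```python
from litellm import completion

response = completion(
    model="openai/gpt-4o",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    stream=True,
)

for chunk in response:  # each chunk follows the OpenAI streaming format
    content = chunk.choices[0].delta.content
    if content is not None:
        print(content, end="")
```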
@ -194,7 +262,7 @@ import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="gpt-3.5-turbo",
model="openai/gpt-4o",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
@ -211,14 +279,29 @@ import os
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
response = completion(
model="claude-2",
model="anthropic/claude-3-sonnet-20240229",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
```
</TabItem>
<TabItem value="xai" label="xAI">
```python
from litellm import completion
import os
## set ENV variables
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
```
</TabItem>
<TabItem value="vertex" label="VertexAI">
```python
@ -230,7 +313,7 @@ os.environ["VERTEX_PROJECT"] = "hardy-device-386718"
os.environ["VERTEX_LOCATION"] = "us-central1"
response = completion(
model="chat-bison",
model="vertex_ai/gemini-1.5-pro",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
@ -238,6 +321,24 @@ response = completion(
</TabItem>
<TabItem value="nvidia" label="NVIDIA">
```python
from litellm import completion
import os
## set ENV variables
os.environ["NVIDIA_NIM_API_KEY"] = "nvidia_api_key"
os.environ["NVIDIA_NIM_API_BASE"] = "nvidia_nim_endpoint_url"
response = completion(
model="nvidia_nim/<model_name>",
    messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
```
</TabItem>
<TabItem value="hugging" label="HuggingFace">
```python
@ -314,6 +415,32 @@ response = completion(
</Tabs>
### Streaming Response Format (OpenAI Format)
```json
{
"id": "chatcmpl-2be06597-eb60-4c70-9ec5-8cd2ab1b4697",
"created": 1734366925,
"model": "claude-3-sonnet-20240229",
"object": "chat.completion.chunk",
"system_fingerprint": null,
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "Hello",
"role": "assistant",
"function_call": null,
"tool_calls": null,
"audio": null
},
"logprobs": null
}
]
}
```
### Exception handling
LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error handling you have for that should work out of the box with LiteLLM.
@ -331,21 +458,21 @@ except OpenAIError as e:
```
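For completeness, a minimal end-to-end sketch (the invalid key is deliberate, so the mapped exception is raised):

```python
import os
from openai import OpenAIError
from litellm import completion

os.environ["ANTHROPIC_API_KEY"] = "bad-key"  # deliberately invalid

try:
    completion(
        model="claude-3-sonnet-20240229",
        messages=[{"content": "Hey, how's it going?", "role": "user"}],
    )
except OpenAIError as e:  # LiteLLM exceptions inherit from OpenAI's exception types
    print(e)
```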
### Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, Helicone, Promptlayer, Traceloop, Slack
LiteLLM exposes pre-defined callbacks to send data to Lunary, MLflow, Langfuse, Helicone, Promptlayer, Traceloop, Slack
```python
from litellm import completion
## set env variables for logging tools
## set env variables for logging tools (API key set up is not required when using MLflow)
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" # get your public key at https://app.lunary.ai/settings
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
os.environ["OPENAI_API_KEY"]
# set callbacks
litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to lunary, langfuse, supabase, helicone
litellm.success_callback = ["lunary", "mlflow", "langfuse", "helicone"] # log input/output to lunary, mlflow, langfuse, helicone
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])

View file

@ -111,5 +111,54 @@ chat.invoke(messages)
</TabItem>
</Tabs>
## Use Langchain ChatLiteLLM with MLflow
MLflow provides an open-source observability solution for ChatLiteLLM.
To enable the integration, simply call `mlflow.litellm.autolog()` in your code before invoking the model. No other setup is necessary.
```python
import mlflow
mlflow.litellm.autolog()
```
Once the auto-tracing is enabled, you can invoke `ChatLiteLLM` and see recorded traces in MLflow.
```python
import os
from langchain.chat_models import ChatLiteLLM
os.environ['OPENAI_API_KEY']="sk-..."
chat = ChatLiteLLM(model="gpt-4o-mini")
chat.invoke("Hi!")
```
## Use Langchain ChatLiteLLM with Lunary
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.schema import HumanMessage
import litellm
os.environ["LUNARY_PUBLIC_KEY"] = "" # from https://app.lunary.ai/settings
os.environ['OPENAI_API_KEY']="sk-..."
litellm.success_callback = ["lunary"]
litellm.failure_callback = ["lunary"]
chat = ChatLiteLLM(model="gpt-4o")
messages = [
HumanMessage(
content="what model are you"
)
]
chat(messages)
```
Get more details [here](../observability/lunary_integration.md)
## Use LangChain ChatLiteLLM + Langfuse
Checkout this section [here](../observability/langfuse_integration#use-langchain-chatlitellm--langfuse) for more details on how to integrate Langfuse with ChatLiteLLM.

View file

@ -25,6 +25,18 @@ Tutorial on how to get to 1K+ RPS with LiteLLM Proxy on locust
callbacks: ["prometheus"] # Enterprise LiteLLM Only - use prometheus to get metrics on your load test
```
**Use this config for testing:**
**Note:** We're currently migrating to aiohttp, which has 10x higher throughput. We recommend using the `aiohttp_openai/` provider for load testing.
```yaml
model_list:
- model_name: "fake-openai-endpoint"
litellm_params:
model: aiohttp_openai/any
api_base: https://your-fake-openai-endpoint.com/chat/completions
api_key: "test"
```
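To actually drive traffic at the proxy, a short locust script along these lines can be used (a sketch; it assumes the proxy is reachable at `http://0.0.0.0:4000` with a valid key, and that the model name matches the config above):

```python
from locust import HttpUser, task, between


class ProxyLoadTest(HttpUser):
    wait_time = between(0.5, 1)

    @task
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer sk-1234",  # replace with a real proxy key
        }
        payload = {
            "model": "fake-openai-endpoint",  # model_name from the config above
            "messages": [{"role": "user", "content": "ping"}],
        }
        self.client.post("/chat/completions", json=payload, headers=headers)
```

Run it with `locust -f <your_file>.py --host http://0.0.0.0:4000` and ramp up users from the Locust UI.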
## Load Test - Fake OpenAI Endpoint
@ -46,7 +58,7 @@ litellm provides a hosted `fake-openai-endpoint` you can load test against
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
model: aiohttp_openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
@ -170,7 +182,7 @@ Use the following [prometheus metrics to debug your load tests / failures](./pro
## Machine Specifications for Running LiteLLM Proxy
👉 **Number of Replicas of LiteLLM Proxy=20** for getting 1K+ RPS
👉 **Number of Replicas of LiteLLM Proxy=4** for getting 1K+ RPS
| Service | Spec | CPUs | Memory | Architecture | Version|
| --- | --- | --- | --- | --- | --- |

View file

@ -19,6 +19,7 @@ Make an account on [Arize AI](https://app.arize.com/auth/login)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize
You can also use the instrumentor option instead of the callback, which you can find [here](https://docs.arize.com/arize/llm-tracing/tracing-integrations-auto/litellm).
```python
litellm.callbacks = ["arize"]
@ -28,7 +29,7 @@ import litellm
import os
os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
os.environ["ARIZE_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""

View file

@ -78,6 +78,17 @@ Following are the allowed fields in metadata, their types, and their description
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
* `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
## Using a self-hosted deployment of Athina
If you are using a self-hosted deployment of Athina, you will need to set the `ATHINA_BASE_URL` environment variable to point to your self-hosted deployment.
```python
...
os.environ["ATHINA_BASE_URL"]= "http://localhost:9000"
...
```
## Support & Talk with Athina Team

View file

@ -67,7 +67,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
}'
```
## Advanced - pass Project ID
## Advanced - pass Project ID or name
<Tabs>
<TabItem value="sdk" label="SDK">
@ -79,7 +79,10 @@ response = litellm.completion(
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"project_id": "my-special-project"
"project_id": "1234",
# passing project_name will try to find a project with that name, or create one if it doesn't exist
# if both project_id and project_name are passed, project_id will be used
# "project_name": "my-special-project"
}
)
```

View file

@ -7,11 +7,11 @@ liteLLM provides `input_callbacks`, `success_callbacks` and `failure_callbacks`,
liteLLM supports:
- [Custom Callback Functions](https://docs.litellm.ai/docs/observability/custom_callback)
- [Lunary](https://lunary.ai/docs)
- [Langfuse](https://langfuse.com/docs)
- [LangSmith](https://www.langchain.com/langsmith)
- [Helicone](https://docs.helicone.ai/introduction)
- [Traceloop](https://traceloop.com/docs)
- [Lunary](https://lunary.ai/docs)
- [Athina](https://docs.athina.ai/)
- [Sentry](https://docs.sentry.io/platforms/python/)
- [PostHog](https://posthog.com/docs/libraries/python)
@ -30,6 +30,7 @@ litellm.success_callback=["posthog", "helicone", "langfuse", "lunary", "athina"]
litellm.failure_callback=["sentry", "lunary", "langfuse"]
## set env variables
os.environ['LUNARY_PUBLIC_KEY'] = ""
os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE'] = "", ""
os.environ['POSTHOG_API_KEY'], os.environ['POSTHOG_API_URL'] = "api-key", "api-url"
os.environ["HELICONE_API_KEY"] = ""

View file

@ -20,9 +20,7 @@ class MyCustomHandler(CustomLogger):
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(f"Post-API Call")
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
@ -30,9 +28,6 @@ class MyCustomHandler(CustomLogger):
print(f"On Failure")
#### ASYNC #### - for acompletion/aembeddings
async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Streaming")
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Success")
@ -127,8 +122,7 @@ from litellm import acompletion
class MyCustomHandler(CustomLogger):
#### ASYNC ####
async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Streaming")
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Async Success")

View file

@ -0,0 +1,176 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Humanloop
[Humanloop](https://humanloop.com/docs/v5/getting-started/overview) enables product teams to build robust AI features with LLMs, using best-in-class tooling for Evaluation, Prompt Management, and Observability.
## Getting Started
Use Humanloop to manage prompts across all LiteLLM Providers.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
import litellm
os.environ["HUMANLOOP_API_KEY"] = "" # [OPTIONAL] set here or in `.completion`
litellm.set_verbose = True # see raw request to provider
resp = litellm.completion(
model="humanloop/gpt-3.5-turbo",
prompt_id="test-chat-prompt",
prompt_variables={"user_message": "this is used"}, # [OPTIONAL]
messages=[{"role": "user", "content": "<IGNORED>"}],
# humanloop_api_key="..." ## alternative to setting env var
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: humanloop/gpt-3.5-turbo
prompt_id: "<humanloop_prompt_id>"
api_key: os.environ/OPENAI_API_KEY
```
2. Start the proxy
```bash
litellm --config config.yaml --detailed_debug
```
3. Test it!
<Tabs>
<TabItem value="curl" label="CURL">
```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "THIS WILL BE IGNORED"
}
],
"prompt_variables": {
"key": "this is used"
}
}'
```
</TabItem>
<TabItem value="OpenAI Python SDK" label="OpenAI Python SDK">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"prompt_variables": { # [OPTIONAL]
"key": "this is used"
}
}
)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
**Expected Logs:**
```
POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/ \
-d '{'model': 'gpt-3.5-turbo', 'messages': <YOUR HUMANLOOP PROMPT TEMPLATE>}'
```
## How to set model
### Set the model on LiteLLM
You can do `humanloop/<litellm_model_name>`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
litellm.completion(
model="humanloop/gpt-3.5-turbo", # or `humanloop/anthropic/claude-3-5-sonnet`
...
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: humanloop/gpt-3.5-turbo # OR humanloop/anthropic/claude-3-5-sonnet
prompt_id: <humanloop_prompt_id>
api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
</Tabs>
### Set the model on Humanloop
LiteLLM will call Humanloop's `https://api.humanloop.com/v5/prompts/<your-prompt-id>` endpoint to get the prompt template.
This also returns the template model set on Humanloop.
```json
{
"template": [
{
... # your prompt template
}
],
"model": "gpt-3.5-turbo" # your template model
}
```

View file

@ -3,13 +3,6 @@ import Image from '@theme/IdealImage';
# Langsmith - Logging LLM Input/Output
:::tip
This is community maintained, Please make an issue if you run into a bug
https://github.com/BerriAI/litellm
:::
An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/
@ -66,7 +59,7 @@ os.environ["LANGSMITH_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set langfuse as a callback, litellm will send the data to langfuse
# set langsmith as a callback, litellm will send the data to langsmith
litellm.success_callback = ["langsmith"]
response = litellm.completion(

Some files were not shown because too many files have changed in this diff.