Merge branch 'BerriAI:main' into main

Author: Mikio Stewart, 2024-08-20 11:52:26 -07:00 (committed by GitHub)
Commit: d71d19be1e
547 changed files with 47310 additions and 24984 deletions


@@ -47,8 +47,8 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai
+pip install openai==1.40.0
-pip install prisma
+pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
 pip install fastapi
@@ -125,6 +125,7 @@ jobs:
 pip install tiktoken
 pip install aiohttp
 pip install click
+pip install "boto3==1.34.34"
 pip install jinja2
 pip install tokenizers
 pip install openai
@@ -165,7 +166,6 @@ jobs:
 pip install "pytest==7.3.1"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install openai
 python -m pip install --upgrade pip
 python -m pip install -r .circleci/requirements.txt
 pip install "pytest==7.3.1"
@@ -190,6 +190,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
+pip install "openai==1.40.0"
 # Run pytest and generate JUnit XML report
 - run:
 name: Build Docker image
@@ -208,6 +209,9 @@ jobs:
 -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
 -e MISTRAL_API_KEY=$MISTRAL_API_KEY \
 -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+-e GROQ_API_KEY=$GROQ_API_KEY \
+-e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \
+-e COHERE_API_KEY=$COHERE_API_KEY \
 -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
 -e AWS_REGION_NAME=$AWS_REGION_NAME \
 -e AUTO_INFER_REGION=True \
@@ -280,10 +284,11 @@ jobs:
 pip install aiohttp
 pip install openai
 python -m pip install --upgrade pip
-python -m pip install -r .circleci/requirements.txt
+pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
 pip install "pytest-mock==3.12.0"
 pip install "pytest-asyncio==0.21.1"
+pip install "boto3==1.34.34"
 pip install mypy
 pip install pyarrow
 pip install numpydoc
@@ -312,6 +317,10 @@ jobs:
 -e OPENAI_API_KEY=$OPENAI_API_KEY \
 -e LITELLM_LICENSE=$LITELLM_LICENSE \
 -e OTEL_EXPORTER="in_memory" \
+-e APORIA_API_BASE_2=$APORIA_API_BASE_2 \
+-e APORIA_API_KEY_2=$APORIA_API_KEY_2 \
+-e APORIA_API_BASE_1=$APORIA_API_BASE_1 \
+-e APORIA_API_KEY_1=$APORIA_API_KEY_1 \
 --name my-app \
 -v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \
 my-app:latest \
@@ -404,7 +413,7 @@ jobs:
 circleci step halt
 fi
 - run:
-name: Trigger Github Action for new Docker Container
+name: Trigger Github Action for new Docker Container + Trigger Stable Release Testing
 command: |
 echo "Install TOML package."
 python3 -m pip install toml
@@ -415,7 +424,8 @@ jobs:
 -H "Authorization: Bearer $GITHUB_TOKEN" \
 "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
 -d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
+echo "triggering stable release server for version ${VERSION} and commit ${CIRCLE_SHA1}"
+curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}"
 workflows:
 version: 2
 build_and_test:
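
The new lines at the end of this hunk trigger stable-release load testing over plain HTTP once the workflow dispatch succeeds. A minimal Python sketch of the same call, assuming the `requests` package is available; the endpoint and query parameters are taken directly from the added curl line:

```python
# Sketch: trigger the stable-release load test the same way the added curl step does.
# Assumes `requests` is installed; VERSION and CIRCLE_SHA1 come from the CI environment.
import os
import requests

version = os.environ.get("VERSION", "0.0.0")            # set earlier in the real job
commit_hash = os.environ.get("CIRCLE_SHA1", "unknown")

resp = requests.post(
    "https://proxyloadtester-production.up.railway.app/start/load/test",
    params={"version": version, "commit_hash": commit_hash},
    timeout=30,
)
print(resp.status_code, resp.text)
```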


@@ -1,11 +1,11 @@
 # used by CI/CD testing
-openai
+openai==1.34.0
 python-dotenv
 tiktoken
 importlib_metadata
 cohere
 redis
 anthropic
-orjson
+orjson==3.9.15
 pydantic==2.7.1
 google-cloud-aiplatform==1.43.0
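
Pinned versions like these can silently drift from what a job actually installs. A small stdlib-only sketch that checks installed versions against the pins; the pin list here is copied from the diff and is purely illustrative:

```python
# Sketch: verify that installed package versions match the CI pins above.
from importlib.metadata import version, PackageNotFoundError

PINS = {"openai": "1.34.0", "orjson": "3.9.15", "pydantic": "2.7.1"}

for package, expected in PINS.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {expected})")
        continue
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{package}: {installed} {status}")
```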


@@ -21,6 +21,14 @@ env:
 # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
 jobs:
+# print commit hash, tag, and release type
+print:
+runs-on: ubuntu-latest
+steps:
+- run: |
+echo "Commit hash: ${{ github.event.inputs.commit_hash }}"
+echo "Tag: ${{ github.event.inputs.tag }}"
+echo "Release type: ${{ github.event.inputs.release_type }}"
 docker-hub-deploy:
 if: github.repository == 'BerriAI/litellm'
 runs-on: ubuntu-latest
@@ -186,6 +194,8 @@ jobs:
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
 build-and-push-helm-chart:
+if: github.event.inputs.release_type != 'dev'
+needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database]
 runs-on: ubuntu-latest
 steps:
 - name: Checkout repository
@@ -203,9 +213,17 @@ jobs:
 - name: lowercase github.repository_owner
 run: |
 echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
 - name: Get LiteLLM Latest Tag
 id: current_app_tag
-uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
+shell: bash
+run: |
+LATEST_TAG=$(git describe --tags --exclude "*dev*" --abbrev=0)
+if [ -z "${LATEST_TAG}" ]; then
+echo "latest_tag=latest" | tee -a $GITHUB_OUTPUT
+else
+echo "latest_tag=${LATEST_TAG}" | tee -a $GITHUB_OUTPUT
+fi
 - name: Get last published chart version
 id: current_version
@@ -233,7 +251,7 @@ jobs:
 name: ${{ env.CHART_NAME }}
 repository: ${{ env.REPO_OWNER }}
 tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
-app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
+app_version: ${{ steps.current_app_tag.outputs.latest_tag }}
 path: deploy/charts/${{ env.CHART_NAME }}
 registry: ${{ env.REGISTRY }}
 registry_username: ${{ github.actor }}
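
The replacement step resolves the chart's `app_version` from the latest non-dev git tag and falls back to `latest` when no such tag exists. A rough Python equivalent of that shell logic, assuming it runs inside a git checkout with tags fetched (as on the Actions runner):

```python
# Sketch: mirror the workflow's tag-resolution logic in Python.
import subprocess

def latest_non_dev_tag() -> str:
    try:
        tag = subprocess.check_output(
            ["git", "describe", "--tags", "--exclude", "*dev*", "--abbrev=0"],
            text=True,
        ).strip()
    except subprocess.CalledProcessError:
        tag = ""
    # Same fallback as the workflow step: use "latest" when no suitable tag is found.
    return tag or "latest"

print(f"latest_tag={latest_non_dev_tag()}")
```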

.gitignore

@@ -1,5 +1,7 @@
 .venv
 .env
+.newenv
+newenv/*
 litellm/proxy/myenv/*
 litellm_uuid.txt
 __pycache__/


@@ -62,6 +62,11 @@ COPY --from=builder /wheels/ /wheels/
 RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
 # Generate prisma client
+ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
+RUN mkdir -p /.cache
+RUN chmod -R 777 /.cache
+RUN pip install nodejs-bin
+RUN pip install prisma
 RUN prisma generate
 RUN chmod +x entrypoint.sh
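
The added lines point Prisma at a writable binary cache and pre-create `/.cache` with permissive modes before `prisma generate` runs. A small, illustrative startup check (stdlib only; the paths are taken from the diff above and this script is not part of the image) that the prepared directories are actually writable at runtime:

```python
# Sketch: sanity-check the directories the Dockerfile prepares for Prisma.
import os

def check_writable(path: str) -> None:
    exists = os.path.isdir(path)
    writable = exists and os.access(path, os.W_OK)
    print(f"{path}: exists={exists} writable={writable}")

check_writable(os.environ.get("PRISMA_BINARY_CACHE_DIR", "/app/prisma"))
check_writable("/.cache")
```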

Dockerfile.custom_ui (new file)

@@ -0,0 +1,41 @@
# Use the provided base image
FROM ghcr.io/berriai/litellm:litellm_fwd_server_root_path-dev
# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
ENV UI_BASE_PATH="/prod/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]


@@ -62,6 +62,11 @@ RUN pip install PyJWT --no-cache-dir
 RUN chmod +x build_admin_ui.sh && ./build_admin_ui.sh
 # Generate prisma client
+ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
+RUN mkdir -p /.cache
+RUN chmod -R 777 /.cache
+RUN pip install nodejs-bin
+RUN pip install prisma
 RUN prisma generate
 RUN chmod +x entrypoint.sh


@@ -8,10 +8,10 @@
 <img src="https://railway.app/button.svg" alt="Deploy on Railway">
 </a>
 </p>
-<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
+<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
 <br>
 </p>
-<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
+<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">LiteLLM Proxy Server (LLM Gateway)</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
 <h4 align="center">
 <a href="https://pypi.org/project/litellm/" target="_blank">
 <img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@@ -35,9 +35,9 @@ LiteLLM manages:
 - Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
 - [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
 - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
-- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
+- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy)
-[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
+[**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs)
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
 🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published.
@@ -120,6 +120,7 @@ from litellm import completion
 ## set env variables for logging tools
 os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
+os.environ["HELICONE_API_KEY"] = "your-helicone-auth-key"
 os.environ["LANGFUSE_PUBLIC_KEY"] = ""
 os.environ["LANGFUSE_SECRET_KEY"] = ""
 os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
@@ -127,13 +128,13 @@ os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
 os.environ["OPENAI_API_KEY"]
 # set callbacks
-litellm.success_callback = ["lunary", "langfuse", "athina"] # log input/output to lunary, langfuse, supabase, athina etc
+litellm.success_callback = ["lunary", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc
 #openai call
 response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
 ```
-# OpenAI Proxy - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
+# LiteLLM Proxy Server (LLM Gateway) - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
 Track spend + Load Balance across multiple projects
@@ -165,6 +166,10 @@ $ litellm --model huggingface/bigcode/starcoder
 ### Step 2: Make ChatCompletions Request to Proxy
+> [!IMPORTANT]
+> 💡 [Use LiteLLM Proxy with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/proxy/user_keys)
 ```python
 import openai # openai v1.0.0+
 client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
@@ -190,8 +195,15 @@ git clone https://github.com/BerriAI/litellm
 # Go to folder
 cd litellm
-# Add the master key
+# Add the master key - you can change this after setup
 echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
+# Add the litellm salt key - you cannot change this after adding a model
+# It is used to encrypt / decrypt your LLM API Key credentials
+# We recommned - https://1password.com/password-generator/
+# password generator to get a random hash for litellm salt key
+echo 'LITELLM_SALT_KEY="sk-1234"' > .env
 source .env
 # Start
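
The new README lines introduce `LITELLM_SALT_KEY`, which should be a strong random value because it encrypts the stored LLM API key credentials and cannot change after models are added. A small sketch for generating one and appending it to `.env`; it uses only the stdlib, and the `sk-` prefix and file path are illustrative:

```python
# Sketch: generate a random LITELLM_SALT_KEY and append it to .env.
# Run once, before adding any model, since the salt key must not change afterwards.
import secrets
from pathlib import Path

salt_key = "sk-" + secrets.token_urlsafe(32)   # "sk-" prefix mirrors the README example
env_file = Path(".env")

with env_file.open("a") as f:
    f.write(f'LITELLM_SALT_KEY="{salt_key}"\n')

print(f"wrote LITELLM_SALT_KEY to {env_file.resolve()}")
```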


@@ -0,0 +1,565 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Migrating to LiteLLM Proxy from OpenAI/Azure OpenAI\n",
"\n",
"Covers:\n",
"\n",
"* /chat/completion\n",
"* /embedding\n",
"\n",
"\n",
"These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**, it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.\n",
"\n",
"For more examples, [go here](https://docs.litellm.ai/docs/proxy/user_keys)\n",
"\n",
"To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)\n",
"\n",
"To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)\n"
],
"metadata": {
"id": "kccfk0mHZ4Ad"
}
},
{
"cell_type": "markdown",
"source": [
"## /chat/completion\n",
"\n"
],
"metadata": {
"id": "nmSClzCPaGH6"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "_vqcjwOVaKpO"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "x1e_Ok3KZzeP"
},
"outputs": [],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"source": [
"## Function Calling"
],
"metadata": {
"id": "AqkyKk9Scxgj"
}
},
{
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"client = OpenAI(\n",
" api_key=\"sk-1234\", # [OPTIONAL] set if you set one on proxy, else set \"\"\n",
" base_url=\"http://0.0.0.0:4000\",\n",
")\n",
"\n",
"tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
" },\n",
" \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
" },\n",
" \"required\": [\"location\"],\n",
" },\n",
" }\n",
" }\n",
"]\n",
"messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
"completion = client.chat.completions.create(\n",
" model=\"gpt-4o\", # use 'model_name' from config.yaml\n",
" messages=messages,\n",
" tools=tools,\n",
" tool_choice=\"auto\"\n",
")\n",
"\n",
"print(completion)\n"
],
"metadata": {
"id": "wDg10VqLczE1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Azure OpenAI Python SDK"
],
"metadata": {
"id": "YYoxLloSaNWW"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"client = openai.AzureOpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
],
"metadata": {
"id": "yA1XcgowaSRy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Python"
],
"metadata": {
"id": "yl9qhDvnaTpL"
}
},
{
"cell_type": "code",
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts.chat import (\n",
" ChatPromptTemplate,\n",
" HumanMessagePromptTemplate,\n",
" SystemMessagePromptTemplate,\n",
")\n",
"from langchain.schema import HumanMessage, SystemMessage\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"anything\"\n",
"\n",
"chat = ChatOpenAI(\n",
" openai_api_base=\"http://0.0.0.0:4000\",\n",
" model = \"gpt-3.5-turbo\",\n",
" temperature=0.1,\n",
" extra_body={\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-langchain-client\",\n",
" \"generation_id\": \"langchain-client-gen-id22\",\n",
" \"trace_id\": \"langchain-client-trace-id22\",\n",
" \"trace_user_id\": \"langchain-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"messages = [\n",
" SystemMessage(\n",
" content=\"You are a helpful assistant that im using to make a test request to.\"\n",
" ),\n",
" HumanMessage(\n",
" content=\"test from litellm. tell me why it's amazing in 1 sentence\"\n",
" ),\n",
"]\n",
"response = chat(messages)\n",
"\n",
"print(response)"
],
"metadata": {
"id": "5MUZgSquaW5t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl"
],
"metadata": {
"id": "B9eMgnULbRaz"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```\n",
"curl -X POST 'http://0.0.0.0:4000/chat/completions' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d '{\n",
" \"model\": \"gpt-3.5-turbo\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what llm are you\"\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-test-generation\",\n",
" \"generation_id\": \"gen-id22\",\n",
" \"trace_id\": \"trace-id22\",\n",
" \"trace_user_id\": \"user-id2\"\n",
" }\n",
"}'\n",
"```\n",
"\n"
],
"metadata": {
"id": "VWCCk5PFcmhS"
}
},
{
"cell_type": "markdown",
"source": [
"### LlamaIndex"
],
"metadata": {
"id": "drBAm2e1b6xe"
}
},
{
"cell_type": "code",
"source": [
"import os, dotenv\n",
"\n",
"from llama_index.llms import AzureOpenAI\n",
"from llama_index.embeddings import AzureOpenAIEmbedding\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"\n",
"llm = AzureOpenAI(\n",
" engine=\"azure-gpt-3.5\", # model_name on litellm proxy\n",
" temperature=0.0,\n",
" azure_endpoint=\"http://0.0.0.0:4000\", # litellm proxy endpoint\n",
" api_key=\"sk-1234\", # litellm proxy API Key\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"embed_model = AzureOpenAIEmbedding(\n",
" deployment_name=\"azure-embedding-model\",\n",
" azure_endpoint=\"http://0.0.0.0:4000\",\n",
" api_key=\"sk-1234\",\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"\n",
"documents = SimpleDirectoryReader(\"llama_index_data\").load_data()\n",
"service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)\n",
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
"\n",
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What did the author do growing up?\")\n",
"print(response)\n"
],
"metadata": {
"id": "d0bZcv8fb9mL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain JS"
],
"metadata": {
"id": "xypvNdHnb-Yy"
}
},
{
"cell_type": "code",
"source": [
"import { ChatOpenAI } from \"@langchain/openai\";\n",
"\n",
"\n",
"const model = new ChatOpenAI({\n",
" modelName: \"gpt-4\",\n",
" openAIApiKey: \"sk-1234\",\n",
" modelKwargs: {\"metadata\": \"hello world\"} // 👈 PASS Additional params here\n",
"}, {\n",
" basePath: \"http://0.0.0.0:4000\",\n",
"});\n",
"\n",
"const message = await model.invoke(\"Hi there!\");\n",
"\n",
"console.log(message);\n"
],
"metadata": {
"id": "R55mK2vCcBN2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### OpenAI JS"
],
"metadata": {
"id": "nC4bLifCcCiW"
}
},
{
"cell_type": "code",
"source": [
"const { OpenAI } = require('openai');\n",
"\n",
"const openai = new OpenAI({\n",
" apiKey: \"sk-1234\", // This is the default and can be omitted\n",
" baseURL: \"http://0.0.0.0:4000\"\n",
"});\n",
"\n",
"async function main() {\n",
" const chatCompletion = await openai.chat.completions.create({\n",
" messages: [{ role: 'user', content: 'Say this is a test' }],\n",
" model: 'gpt-3.5-turbo',\n",
" }, {\"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-openaijs-client\",\n",
" \"generation_id\": \"openaijs-client-gen-id22\",\n",
" \"trace_id\": \"openaijs-client-trace-id22\",\n",
" \"trace_user_id\": \"openaijs-client-user-id2\"\n",
" }});\n",
"}\n",
"\n",
"main();\n"
],
"metadata": {
"id": "MICH8kIMcFpg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Anthropic SDK"
],
"metadata": {
"id": "D1Q07pEAcGTb"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"from anthropic import Anthropic\n",
"\n",
"client = Anthropic(\n",
" base_url=\"http://localhost:4000\", # proxy endpoint\n",
" api_key=\"sk-s4xN1IiLTCytwtZFJaYQrA\", # litellm proxy virtual key\n",
")\n",
"\n",
"message = client.messages.create(\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello, Claude\",\n",
" }\n",
" ],\n",
" model=\"claude-3-opus-20240229\",\n",
")\n",
"print(message.content)"
],
"metadata": {
"id": "qBjFcAvgcI3t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## /embeddings"
],
"metadata": {
"id": "dFAR4AJGcONI"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "lgNoM281cRzR"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"from openai import OpenAI\n",
"\n",
"# set base_url to your proxy server\n",
"# set api_key to send to proxy server\n",
"client = OpenAI(api_key=\"<proxy-api-key>\", base_url=\"http://0.0.0.0:4000\")\n",
"\n",
"response = client.embeddings.create(\n",
" input=[\"hello from litellm\"],\n",
" model=\"text-embedding-ada-002\"\n",
")\n",
"\n",
"print(response)\n"
],
"metadata": {
"id": "NY3DJhPfcQhA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Embeddings"
],
"metadata": {
"id": "hmbg-DW6cUZs"
}
},
{
"cell_type": "code",
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"sagemaker-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"SAGEMAKER EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"BEDROCK EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-titan-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"TITAN EMBEDDINGS\")\n",
"print(query_result[:5])"
],
"metadata": {
"id": "lX2S8Nl1cWVP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl Request"
],
"metadata": {
"id": "oqGbWBCQcYfd"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```curl\n",
"curl -X POST 'http://0.0.0.0:4000/embeddings' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d ' {\n",
" \"model\": \"text-embedding-ada-002\",\n",
" \"input\": [\"write a litellm poem\"]\n",
" }'\n",
"```\n",
"\n"
],
"metadata": {
"id": "7rkIMV9LcdwQ"
}
}
]
}
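
The notebook's curl cells already show the raw endpoints, so the same requests work without any SDK at all. A minimal sketch using plain HTTP, assuming a LiteLLM proxy on http://0.0.0.0:4000, the example virtual key `sk-1234`, and the `requests` package:

```python
# Sketch: call the proxy's /chat/completions endpoint with plain HTTP (no SDK).
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "what llm are you"}],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```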


@ -1,10 +1,10 @@
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -21,13 +21,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -49,7 +49,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -61,7 +61,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -70,7 +70,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -79,7 +79,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -109,7 +109,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -128,7 +128,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -148,7 +148,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -162,7 +162,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -174,7 +174,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -184,7 +184,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -193,19 +193,19 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -214,7 +214,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -234,7 +234,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -244,7 +244,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -253,7 +253,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -267,31 +267,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -305,7 +305,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -330,7 +330,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -339,7 +339,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -360,7 +360,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -369,7 +369,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -378,7 +378,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -388,7 +388,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -409,7 +409,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -422,13 +422,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -438,7 +438,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -462,7 +462,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -482,7 +482,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -492,7 +492,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -516,7 +516,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -529,7 +529,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -546,13 +546,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -580,13 +580,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -624,7 +624,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -638,13 +638,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -660,7 +660,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -681,7 +681,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -691,31 +691,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -771,7 +771,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -780,7 +780,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -800,7 +800,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -820,7 +820,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -830,7 +830,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -840,7 +840,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -850,7 +850,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -862,13 +862,13 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -877,7 +877,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -898,7 +898,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -919,7 +919,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -936,19 +936,19 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -961,25 +961,25 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -993,7 +993,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
View file
@ -20,7 +20,7 @@ Call all LLM APIs using the OpenAI format.
Response ID: 52dbbd49-eedb-4c11-8382-3ca7deb1af35 Url: /queue/response/52dbbd49-eedb-4c11-8382-3ca7deb1af35 Response ID: 52dbbd49-eedb-4c11-8382-3ca7deb1af35 Url: /queue/response/52dbbd49-eedb-4c11-8382-3ca7deb1af35
Time: 3.50 seconds Time: 3.50 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -35,7 +35,7 @@ Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. C
Response ID: ae1e2b71-d711-456d-8df0-13ce0709eb04 Url: /queue/response/ae1e2b71-d711-456d-8df0-13ce0709eb04 Response ID: ae1e2b71-d711-456d-8df0-13ce0709eb04 Url: /queue/response/ae1e2b71-d711-456d-8df0-13ce0709eb04
Time: 5.60 seconds Time: 5.60 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
View file
@ -1,4 +1,4 @@
What endpoints does the litellm proxy have 💥 OpenAI Proxy Server What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 100+ LLMs Huggingface/Bedrock/TogetherAI/etc. in the OpenAI ChatCompletions & Completions format Calling 100+ LLMs Huggingface/Bedrock/TogetherAI/etc. in the OpenAI ChatCompletions & Completions format
View file
@ -18,13 +18,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes # This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version. # to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/) # Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1 version: 0.2.3
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to # incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using. # follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes. # It is recommended to use it with quotes.
appVersion: v1.41.8 appVersion: v1.43.18
dependencies: dependencies:
- name: "postgresql" - name: "postgresql"
View file
@ -1,5 +1,9 @@
# Helm Chart for LiteLLM # Helm Chart for LiteLLM
> [!IMPORTANT]
> This is community maintained. Please make an issue if you run into a bug
> We recommend using [Docker or Kubernetes for production deployments](https://docs.litellm.ai/docs/proxy/prod)
## Prerequisites ## Prerequisites
- Kubernetes 1.21+ - Kubernetes 1.21+
View file
@ -9,13 +9,11 @@ services:
######################################### #########################################
## Uncomment these lines to start proxy with a config.yaml file ## ## Uncomment these lines to start proxy with a config.yaml file ##
# volumes: # volumes:
# - ./proxy_server_config.yaml:/app/config.yaml
# command: [ "--config", "./config.yaml", "--port", "4000"]
############################################### ###############################################
ports: ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary - "4000:4000" # Map the container port to the host, change the host port if necessary
environment: environment:
DATABASE_URL: "postgresql://postgres:example@db:5432/postgres" DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file: env_file:
- .env # Load local .env file - .env # Load local .env file
@ -25,11 +23,31 @@ services:
image: postgres image: postgres
restart: always restart: always
environment: environment:
POSTGRES_PASSWORD: example POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready"] test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s interval: 1s
timeout: 5s timeout: 5s
retries: 10 retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
# ...rest of your docker-compose config if any # ...rest of your docker-compose config if any
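The compose service above mounts `./prometheus.yml`, which is not shown in this diff. Here is a minimal sketch of what that file could contain (the job name, scrape interval, and `litellm:4000` target are assumptions based on the port mapping above):
```yaml
global:
  scrape_interval: 15s   # assumed default; tune to your needs

scrape_configs:
  - job_name: litellm    # hypothetical job name
    metrics_path: /metrics
    static_configs:
      - targets: ["litellm:4000"]  # assumes the proxy service is named `litellm` and listens on port 4000
```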
View file
@ -1,23 +1,73 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Batches API # [BETA] Batches API
Covers Batches, Files Covers Batches, Files
## Quick Start ## Quick Start
Call an existing Assistant.
- Create File for Batch Completion - Create File for Batch Completion
- Create Batch Request - Create Batch Request
- List Batches
- Retrieve the Specific Batch and File Content - Retrieve the Specific Batch and File Content
<Tabs> <Tabs>
<TabItem value="proxy" label="LiteLLM PROXY Server">
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
**List Batches**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
</TabItem>
<TabItem value="sdk" label="SDK"> <TabItem value="sdk" label="SDK">
**Create File for Batch Completion** **Create File for Batch Completion**
@ -77,48 +127,15 @@ file_content = await litellm.afile_content(
print("file content = ", file_content) print("file content = ", file_content)
``` ```
</TabItem> **List Batches**
<TabItem value="proxy" label="PROXY">
```bash ```python
$ export OPENAI_API_KEY="sk-..." list_batches_response = litellm.list_batches(custom_llm_provider="openai", limit=2)
print("list_batches_response=", list_batches_response)
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl https://api.openai.com/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch) ## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch)
View file
@ -7,14 +7,14 @@ Don't want to get crazy bills because either while you're calling LLM APIs **or*
:::info :::info
If you want a server to manage user keys, budgets, etc. use our [OpenAI Proxy Server](./proxy/virtual_keys.md) If you want a server to manage user keys, budgets, etc. use our [LiteLLM Proxy Server](./proxy/virtual_keys.md)
::: :::
LiteLLM exposes: LiteLLM exposes:
* `litellm.max_budget`: a global variable you can use to set the max budget (in USD) across all your litellm calls. If this budget is exceeded, it will raise a BudgetExceededError * `litellm.max_budget`: a global variable you can use to set the max budget (in USD) across all your litellm calls. If this budget is exceeded, it will raise a BudgetExceededError
* `BudgetManager`: A class to help set budgets per user. BudgetManager creates a dictionary to manage the user budgets, where the key is user and the object is their current cost + model-specific costs. * `BudgetManager`: A class to help set budgets per user. BudgetManager creates a dictionary to manage the user budgets, where the key is user and the object is their current cost + model-specific costs.
* `OpenAI Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc. * `LiteLLM Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc.
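A minimal sketch of the `litellm.max_budget` flow described above (the budget value and model are illustrative, and `OPENAI_API_KEY` is assumed to be set in the environment):
```python
import litellm
from litellm import completion

litellm.max_budget = 0.0001  # global max budget in USD, intentionally tiny for this sketch

try:
    completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey!"}])
    completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey again!"}])
except litellm.BudgetExceededError as e:
    # raised once total spend across calls exceeds litellm.max_budget
    print("budget exceeded:", e)
```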
## quick start ## quick start
View file
@ -48,19 +48,20 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ | |Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ |
|Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | | |Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | |
|AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | | |VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | |
|Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (for anthropic) | | |Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (model dependent) | |
|Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ | |TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ |
|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | |NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Petals| ✅ | ✅ | | ✅ | ✅ | | | | | | |Petals| ✅ | ✅ | | ✅ | ✅ | | | | | |
|Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | | |Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | |✅| | | | | | |
|Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | | |ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | |
|Github| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ |✅ (model dependent)|✅ (model dependent)| | |
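A quick programmatic check of the table above, using the `litellm.get_supported_openai_params()` helper mentioned at the top of this section (the model/provider pair is illustrative):
```python
from litellm import get_supported_openai_params

# returns the list of OpenAI-style params litellm will map for this model + provider
params = get_supported_openai_params(model="command-nightly", custom_llm_provider="cohere")
print(params)
```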
:::note :::note
By default, LiteLLM raises an exception if the openai param being passed in isn't supported. By default, LiteLLM raises an exception if the openai param being passed in isn't supported.
View file
@ -0,0 +1,321 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Structured Outputs (JSON Mode)
## Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["OPENAI_API_KEY"] = ""
response = completion(
model="gpt-4o-mini",
response_format={ "type": "json_object" },
messages=[
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
{"role": "user", "content": "Who won the world series in 2020?"}
]
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "gpt-4o-mini",
"response_format": { "type": "json_object" },
"messages": [
{
"role": "system",
"content": "You are a helpful assistant designed to output JSON."
},
{
"role": "user",
"content": "Who won the world series in 2020?"
}
]
}'
```
</TabItem>
</Tabs>
## Check Model Support
Call `litellm.get_supported_openai_params` to check if a model/provider supports `response_format`.
```python
from litellm import get_supported_openai_params
params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
assert "response_format" in params
```
## Pass in 'json_schema'
To use Structured Outputs, simply specify
```
response_format: { "type": "json_schema", "json_schema": … , "strict": true }
```
Works for:
- OpenAI models
- Azure OpenAI models
- Google AI Studio - Gemini models
- Vertex AI models (Gemini + Anthropic)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
from pydantic import BaseModel
# add to env var
os.environ["OPENAI_API_KEY"] = ""
messages = [{"role": "user", "content": "List 5 important events in the XIX century"}]
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
class EventsList(BaseModel):
events: list[CalendarEvent]
resp = completion(
model="gpt-4o-2024-08-06",
messages=messages,
response_format=EventsList
)
print("Received={}".format(resp))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add openai model to config.yaml
```yaml
model_list:
- model_name: "gpt-4o"
litellm_params:
model: "gpt-4o-2024-08-06"
```
2. Start proxy with config.yaml
```bash
litellm --config /path/to/config.yaml
```
3. Call with OpenAI SDK / Curl!
Just replace the 'base_url' in the OpenAI SDK to call the proxy with 'json_schema' for OpenAI models
**OpenAI SDK**
```python
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
completion = client.beta.chat.completions.parse(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
{"role": "user", "content": "how can I solve 8x + 7 = -23"}
],
response_format=MathReasoning,
)
math_reasoning = completion.choices[0].message.parsed
```
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful math tutor. Guide the user through the solution step by step."
},
{
"role": "user",
"content": "how can I solve 8x + 7 = -23"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}'
```
</TabItem>
</Tabs>
## Validate JSON Schema
Not all vertex models support passing the json_schema to them (e.g. `gemini-1.5-flash`). To solve this, LiteLLM supports client-side validation of the json schema.
```
litellm.enable_json_schema_validation=True
```
If `litellm.enable_json_schema_validation=True` is set, LiteLLM will validate the json response using `jsonvalidator`.
[**See Code**](https://github.com/BerriAI/litellm/blob/671d8ac496b6229970c7f2a3bdedd6cb84f0746b/litellm/litellm_core_utils/json_validation_rule.py#L4)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# !gcloud auth application-default login - run this to add vertex credentials to your env
import litellm, os
from litellm import completion
from pydantic import BaseModel
messages=[
{"role": "system", "content": "Extract the event information."},
{"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
]
litellm.enable_json_schema_validation = True
litellm.set_verbose = True # see the raw request made by litellm
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
resp = completion(
model="gemini/gemini-1.5-pro",
messages=messages,
response_format=CalendarEvent,
)
print("Received={}".format(resp))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Create config.yaml
```yaml
model_list:
- model_name: "gemini-1.5-flash"
litellm_params:
model: "gemini/gemini-1.5-flash"
api_key: os.environ/GEMINI_API_KEY
litellm_settings:
enable_json_schema_validation: True
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-d '{
"model": "gemini-1.5-flash",
"messages": [
{"role": "system", "content": "Extract the event information."},
{"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
],
"response_format": {
"type": "json_object",
"response_schema": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}
}'
```
</TabItem>
</Tabs>
View file
@ -0,0 +1,119 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Pre-fix Assistant Messages
Supported by:
- Deepseek
- Mistral
- Anthropic
```python
{
"role": "assistant",
"content": "..",
...
"prefix": true # 👈 KEY CHANGE
}
```
## Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["DEEPSEEK_API_KEY"] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "Who won the world cup in 2022?"},
{"role": "assistant", "content": "Argentina", "prefix": True}
]
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "deepseek/deepseek-chat",
"messages": [
{
"role": "user",
"content": "Who won the world cup in 2022?"
},
{
"role": "assistant",
"content": "Argentina", "prefix": true
}
]
}'
```
</TabItem>
</Tabs>
**Expected Response**
```bash
{
"id": "3b66124d79a708e10c603496b363574c",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": " won the FIFA World Cup in 2022.",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1723323084,
"model": "deepseek/deepseek-chat",
"object": "chat.completion",
"system_fingerprint": "fp_7e0991cad4",
"usage": {
"completion_tokens": 12,
"prompt_tokens": 16,
"total_tokens": 28,
},
"service_tier": null
}
```
## Check Model Support
Call `litellm.get_model_info` to check if a model/provider supports assistant prefill.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import get_model_info
params = get_model_info(model="deepseek/deepseek-chat")
assert params["supports_assistant_prefill"] is True
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Call the `/model/info` endpoint to get a list of models + their supported params.
```bash
curl -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer $LITELLM_KEY'
```
</TabItem>
</Tabs>
View file
@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Streaming + Async # Streaming + Async
- [Streaming Responses](#streaming-responses) - [Streaming Responses](#streaming-responses)
@ -74,3 +77,72 @@ async def completion_call():
asyncio.run(completion_call()) asyncio.run(completion_call())
``` ```
## Error Handling - Infinite Loops
Sometimes a model might enter an infinite loop, and keep repeating the same chunks - [e.g. issue](https://github.com/BerriAI/litellm/issues/5158)
Break out of it with:
```python
litellm.REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
```
LiteLLM provides error handling for this, by checking if a chunk is repeated 'n' times (Default is 100). If it exceeds that limit, it will raise a `litellm.InternalServerError`, to allow retry logic to happen.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
import time
litellm.set_verbose = False
loop_amount = litellm.REPEATED_STREAMING_CHUNK_LIMIT + 1
chunks = [
litellm.ModelResponse(**{
"id": "chatcmpl-123",
"object": "chat.completion.chunk",
"created": 1694268190,
"model": "gpt-3.5-turbo-0125",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{"index": 0, "delta": {"content": "How are you?"}, "finish_reason": "stop"}
],
}, stream=True)
] * loop_amount
completion_stream = litellm.ModelResponseListIterator(model_responses=chunks)
response = litellm.CustomStreamWrapper(
completion_stream=completion_stream,
model="gpt-3.5-turbo",
custom_llm_provider="cached_response",
logging_obj=litellm.Logging(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
),
)
for chunk in response:
continue # expect to raise InternalServerError
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Define this on your config.yaml on the proxy.
```yaml
litellm_settings:
REPEATED_STREAMING_CHUNK_LIMIT: 100 # this overrides the litellm default
```
The proxy uses the litellm SDK. To validate this works, try the 'SDK' code snippet.
</TabItem>
</Tabs>
View file
@ -14,6 +14,14 @@
For security inquiries, please contact us at support@berri.ai For security inquiries, please contact us at support@berri.ai
## Self-hosted LiteLLM Instances
- **No data or telemetry is stored on LiteLLM servers when you self-host**
- For installation and configuration, see: [Self-hosting guide](../docs/proxy/deploy.md)
- **Telemetry** - We run no telemetry when you self-host LiteLLM
For security inquiries, please contact us at support@berri.ai
### Supported data regions for LiteLLM Cloud ### Supported data regions for LiteLLM Cloud
LiteLLM supports the following data regions: LiteLLM supports the following data regions:
View file
@ -270,7 +270,7 @@ response = embedding(
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` | | embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
## HuggingFace Embedding Models ## HuggingFace Embedding Models
LiteLLM supports all Feature-Extraction Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction LiteLLM supports all Feature-Extraction + Sentence Similarity Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
### Usage ### Usage
```python ```python
@ -282,6 +282,25 @@ response = embedding(
input=["good morning from litellm"] input=["good morning from litellm"]
) )
``` ```
### Usage - Set input_type
LiteLLM infers input type (feature-extraction or sentence-similarity) by making a GET request to the api base.
Override this, by setting the `input_type` yourself.
```python
from litellm import embedding
import os
os.environ['HUGGINGFACE_API_KEY'] = ""
response = embedding(
model='huggingface/microsoft/codebert-base',
input=["good morning from litellm", "you are a good bot"],
api_base = "https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
input_type="sentence-similarity"
)
```
### Usage - Custom API Base ### Usage - Custom API Base
```python ```python
from litellm import embedding from litellm import embedding
View file
@ -27,11 +27,17 @@ This covers:
- ✅ IP addressbased access control lists - ✅ IP addressbased access control lists
- ✅ Track Request IP Address - ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints) - ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests) - ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)
- **Spend Tracking** - **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./proxy/team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./proxy/team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags) - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics** - **Prometheus Metrics**
- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](./proxy/prometheus)
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation** - **Guardrails, PII Masking, Content Moderation**
- ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation) - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation)
View file
@ -0,0 +1,313 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [Beta] Fine-tuning API
:::info
This is an Enterprise only endpoint [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Supported Providers
- Azure OpenAI
- OpenAI
- Vertex AI
Add `finetune_settings` and `files_settings` to your litellm config.yaml to use the fine-tuning endpoints.
## Example config.yaml for `finetune_settings` and `files_settings`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
# For /fine_tuning/jobs endpoints
finetune_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: os.environ/AZURE_API_KEY
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
- custom_llm_provider: "vertex_ai"
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
# for /files endpoints
files_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: fake-key
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
```
## Create File for fine-tuning
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") # base_url is your litellm proxy url
file_name = "openai_batch_completions.jsonl"
response = await client.files.create(
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
file=open(file_name, "rb"),
purpose="fine-tune",
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F custom_llm_provider="azure"\
-F file="@mydata.jsonl"
```
</TabItem>
</Tabs>
## Create fine-tuning job
<Tabs>
<TabItem value="azure" label="Azure OpenAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gpt-35-turbo-1106", # Azure OpenAI model you want to fine-tune
training_file="file-abc123", # file_id from create file response
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "azure",
"model": "gpt-35-turbo-1106",
"training_file": "file-abc123"
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="Vertex" label="VertexAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gemini-1.0-pro-002", # Vertex model you want to fine-tune
training_file="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl", # file_id from create file response
extra_body={"custom_llm_provider": "vertex_ai"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl (Unified API)">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "vertex_ai",
"model": "gemini-1.0-pro-002",
"training_file": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}'
```
</TabItem>
<TabItem value="curl-vtx" label="curl (VertexAI API)">
:::info
Use this to create Fine tuning Jobs in [the Vertex AI API Format](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning#create-tuning)
:::
```shell
curl http://localhost:4000/v1/projects/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Request Body
<Tabs>
<TabItem value="params" label="Supported Params">
* `model`
**Type:** string
**Required:** Yes
The name of the model to fine-tune
* `custom_llm_provider`
**Type:** `Literal["azure", "openai", "vertex_ai"]`
**Required:** Yes
The LLM provider to route the fine-tuning job to. You can select one of the [**supported providers**](#supported-providers)
* `training_file`
**Type:** string
**Required:** Yes
The ID of an uploaded file that contains training data.
- See **upload file** for how to upload a file.
- Your dataset must be formatted as a JSONL file.
* `hyperparameters`
**Type:** object
**Required:** No
The hyperparameters used for the fine-tuning job.
> #### Supported `hyperparameters`
> #### batch_size
**Type:** string or integer
**Required:** No
Number of examples in each batch. A larger batch size means that model parameters are updated less frequently, but with lower variance.
> #### learning_rate_multiplier
**Type:** string or number
**Required:** No
Scaling factor for the learning rate. A smaller learning rate may be useful to avoid overfitting.
> #### n_epochs
**Type:** string or integer
**Required:** No
The number of epochs to train the model for. An epoch refers to one full cycle through the training dataset.
* `suffix`
**Type:** string or null
**Required:** No
**Default:** null
A string of up to 18 characters that will be added to your fine-tuned model name.
Example: A `suffix` of "custom-model-name" would produce a model name like `ft:gpt-4o-mini:openai:custom-model-name:7p4lURel`.
* `validation_file`
**Type:** string or null
**Required:** No
The ID of an uploaded file that contains validation data.
- If provided, this data is used to generate validation metrics periodically during fine-tuning.
* `integrations`
**Type:** array or null
**Required:** No
A list of integrations to enable for your fine-tuning job.
* `seed`
**Type:** integer or null
**Required:** No
The seed controls the reproducibility of the job. Passing in the same seed and job parameters should produce the same results, but may differ in rare cases. If a seed is not specified, one will be generated for you.
</TabItem>
<TabItem value="example" label="Example Request Body">
```json
{
"model": "gpt-4o-mini",
"training_file": "file-abcde12345",
"hyperparameters": {
"batch_size": 4,
"learning_rate_multiplier": 0.1,
"n_epochs": 3
},
"suffix": "custom-model-v1",
"validation_file": "file-fghij67890",
"seed": 42
}
```
</TabItem>
</Tabs>
## Cancel fine-tuning job
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
# cancel specific fine tuning job
cancel_ft_job = await client.fine_tuning.jobs.cancel(
fine_tuning_job_id="123", # fine tuning job id
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
print("response from cancel ft job={}".format(cancel_ft_job))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X POST http://localhost:4000/v1/fine_tuning/jobs/ftjob-abc123/cancel \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{"custom_llm_provider": "azure"}'
```
</TabItem>
</Tabs>
## List fine-tuning jobs
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
list_ft_jobs = await client.fine_tuning.jobs.list(
extra_query={"custom_llm_provider": "azure"} # tell litellm proxy which provider to use
)
print("list of ft jobs={}".format(list_ft_jobs))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X GET 'http://localhost:4000/v1/fine_tuning/jobs?custom_llm_provider=azure' \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
</TabItem>
</Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/fine-tuning)
View file
@ -87,13 +87,14 @@ from litellm import completion
## set env variables for logging tools ## set env variables for logging tools
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["OPENAI_API_KEY"] os.environ["OPENAI_API_KEY"]
# set callbacks # set callbacks
litellm.success_callback = ["lunary", "langfuse"] # log input/output to langfuse, lunary, supabase litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to langfuse, lunary, supabase, helicone
#openai call #openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
View file
@ -10,14 +10,41 @@ https://github.com/BerriAI/litellm
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints - Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']` - [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy) - Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## How to use LiteLLM ## How to use LiteLLM
You can use litellm through either: You can use litellm through either:
1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects 1. [LiteLLM Proxy Server](#openai-proxy) - Server (LLM Gateway) to call 100+ LLMs, load balance, cost tracking across projects
2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking 2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking
## LiteLLM Python SDK ### **When to use LiteLLM Proxy Server (LLM Gateway)**
:::tip
Use LiteLLM Proxy Server if you want a **central service (LLM Gateway) to access multiple LLMs**
Typically used by Gen AI Enablement / ML Platform Teams
:::
- LiteLLM Proxy gives you a unified interface to access multiple LLMs (100+ LLMs)
- Track LLM Usage and setup guardrails
- Customize Logging, Guardrails, Caching per project
### **When to use LiteLLM Python SDK**
:::tip
Use LiteLLM Python SDK if you want to use LiteLLM in your **python code**
Typically used by developers building llm projects
:::
- LiteLLM SDK gives you a unified interface to access multiple LLMs (100+ LLMs)
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
## **LiteLLM Python SDK**
### Basic usage ### Basic usage
@ -310,6 +337,7 @@ LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, Helicone
from litellm import completion from litellm import completion
## set env variables for logging tools ## set env variables for logging tools
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
@ -317,7 +345,7 @@ os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
os.environ["OPENAI_API_KEY"] os.environ["OPENAI_API_KEY"]
# set callbacks # set callbacks
litellm.success_callback = ["lunary", "langfuse"] # log input/output to lunary, langfuse, supabase litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to lunary, langfuse, supabase, helicone
#openai call #openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
@ -356,7 +384,7 @@ response = completion(
) )
``` ```
## OpenAI Proxy ## **LiteLLM Proxy Server (LLM Gateway)**
Track spend across multiple projects/people Track spend across multiple projects/people
View file
@ -1,6 +1,6 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
# 🔥 Load Test LiteLLM # Load Test LiteLLM
## How to run a locust load test on LiteLLM Proxy ## How to run a locust load test on LiteLLM Proxy
View file
@ -0,0 +1,20 @@
# Migration Policy
## New Beta Feature Introduction
- If we introduce a new feature that may move to the Enterprise Tier, it will be clearly labeled as **Beta**, with the following example disclaimer:
**Example Disclaimer**
:::info
Beta Feature - This feature might move to LiteLLM Enterprise
:::
## Policy if a Beta Feature moves to Enterprise
If we decide to move a beta feature to the paid Enterprise version we will:
- Provide **at least 30 days** notice to all users of the beta feature
- Provide **a free 3 month License to prevent any disruptions to production**
- Provide a **dedicated slack, discord, microsoft teams support channel** to help your team during this transition
View file
@ -0,0 +1,72 @@
import Image from '@theme/IdealImage';
# Arize AI
AI Observability and Evaluation Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
## Pre-Requisites
Make an account on [Arize AI](https://app.arize.com/auth/login)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize:
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
### Using with LiteLLM Proxy
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
View file
@ -0,0 +1,147 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Braintrust - Evals + Logging
[Braintrust](https://www.braintrust.dev/) manages evaluations, logging, prompt playground, and data management for AI products.
## Quick Start
```python
# pip install litellm
import litellm
import os
# set env
os.environ["BRAINTRUST_API_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
## OpenAI Proxy Usage
1. Add keys to env
```env
BRAINTRUST_API_KEY=""
```
2. Add braintrust to callbacks
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
callbacks: ["braintrust"]
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "groq-llama3",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
## Advanced - pass Project ID
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"project_id": "my-special-project"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "groq-llama3",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
],
"metadata": {
"project_id": "my-special-project"
}
}'
```
**OpenAI SDK**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
"metadata": { # 👈 use for logging additional params (e.g. to langfuse)
"project_id": "my-special-project"
}
}
)
print(response)
```
For more examples, [**Click Here**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>
## Full API Spec
Here's everything you can pass in metadata for a braintrust request
`braintrust_*` - any metadata field starting with `braintrust_` will be passed as metadata to the logging request
`project_id` - set the project id for a braintrust call. Default is `litellm`.
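For example, a sketch of passing both fields via `metadata` (the `braintrust_*` key shown is hypothetical, used only to illustrate the pass-through):
```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
    metadata={
        "project_id": "my-special-project",       # overrides the default "litellm" project
        "braintrust_span_name": "prod-traffic",   # hypothetical braintrust_* field, forwarded as metadata
    },
)
print(response)
```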
View file
@ -0,0 +1,127 @@
import Image from '@theme/IdealImage';
# Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["gcs_bucket"] # 👈 KEY CHANGE # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
## Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
## Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
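For example (paths and bucket name are placeholders; these are the same env variables from step 2 above):
```shell
export GCS_BUCKET_NAME="my-litellm-logs"
export GCS_PATH_SERVICE_ACCOUNT="/path/to/service_account.json"  # the key file you just downloaded
```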
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
View file
@ -1,64 +1,170 @@
# Helicone Tutorial # Helicone - OSS LLM Observability Platform
:::tip :::tip
This is community maintained, Please make an issue if you run into a bug This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm https://github.com/BerriAI/litellm
::: :::
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your LLM requests and provides key insights into your usage, spend, latency and more.
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage. ## Using Helicone with LiteLLM
## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM) LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily log data to Helicone based on the status of your responses.
liteLLM provides `success_callbacks` and `failure_callbacks`, making it easy for you to send data to a particular provider depending on the status of your responses.
In this case, we want to log requests to Helicone when a request succeeds. ### Supported LLM Providers
Helicone can log requests across [various LLM providers](https://docs.helicone.ai/getting-started/quick-start), including:
- OpenAI
- Azure
- Anthropic
- Gemini
- Groq
- Cohere
- Replicate
- And more
### Integration Methods
There are two main approaches to integrate Helicone with LiteLLM:
1. Using callbacks
2. Using Helicone as a proxy
Let's explore each method in detail.
### Approach 1: Use Callbacks ### Approach 1: Use Callbacks
Use just 1 line of code, to instantly log your responses **across all providers** with helicone:
Use just 1 line of code to instantly log your responses **across all providers** with Helicone:
```python ```python
litellm.success_callback=["helicone"] litellm.success_callback = ["helicone"]
``` ```
Complete code
```python
from litellm import completion
## set env variables
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["OPENAI_API_KEY"], os.environ["COHERE_API_KEY"] = "", ""
# set callbacks
litellm.success_callback=["helicone"]
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
#cohere call
response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}])
```
### Approach 2: [OpenAI + Azure only] Use Helicone as a proxy
Helicone provides advanced functionality like caching, etc. Helicone currently supports this for Azure and OpenAI.
If you want to use Helicone to proxy your OpenAI/Azure requests, then you can -
- Set helicone as your base url via: `litellm.api_url`
- Pass in helicone request headers via: `litellm.headers`
Complete Code Complete Code
```python ```python
import litellm import os
from litellm import completion from litellm import completion
litellm.api_base = "https://oai.hconeai.com/v1" ## Set env variables
litellm.headers = {"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}"} os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["OPENAI_API_KEY"] = "your-openai-key"
response = litellm.completion( # Set callbacks
model="gpt-3.5-turbo", litellm.success_callback = ["helicone"]
messages=[{"role": "user", "content": "how does a court case get to the Supreme Court?"}]
# OpenAI call
response = completion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
) )
print(response) print(response)
``` ```
### Approach 2: Use Helicone as a proxy
Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/getting-started/proxy-vs-async) like caching, rate limiting, LLM security through [PromptArmor](https://promptarmor.com/) and more.
To use Helicone as a proxy for your LLM requests:
1. Set Helicone as your base URL via: `litellm.api_base`
2. Pass in Helicone request headers via: `litellm.metadata`
Complete Code:
```python
import os
import litellm
from litellm import completion
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
}
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "How does a court case get to the Supreme Court?"}]
)
print(response)
```
### Advanced Usage
You can add custom metadata and properties to your requests using Helicone headers. Here are some examples:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-User-Id": "user-abc", # Specify the user making the request
"Helicone-Property-App": "web", # Custom property to add additional information
"Helicone-Property-Custom": "any-value", # Add any custom property
"Helicone-Prompt-Id": "prompt-supreme-court", # Assign an ID to associate this prompt with future versions
"Helicone-Cache-Enabled": "true", # Enable caching of responses
"Cache-Control": "max-age=3600", # Set cache limit to 1 hour
"Helicone-RateLimit-Policy": "10;w=60;s=user", # Set rate limit policy
"Helicone-Retry-Enabled": "true", # Enable retry mechanism
"helicone-retry-num": "3", # Set number of retries
"helicone-retry-factor": "2", # Set exponential backoff factor
"Helicone-Model-Override": "gpt-3.5-turbo-0613", # Override the model used for cost calculation
"Helicone-Session-Id": "session-abc-123", # Set session ID for tracking
"Helicone-Session-Path": "parent-trace/child-trace", # Set session path for hierarchical tracking
"Helicone-Omit-Response": "false", # Include response in logging (default behavior)
"Helicone-Omit-Request": "false", # Include request in logging (default behavior)
"Helicone-LLM-Security-Enabled": "true", # Enable LLM security features
"Helicone-Moderations-Enabled": "true", # Enable content moderation
"Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', # Set fallback models
}
```
### Caching and Rate Limiting
Enable caching and set up rate limiting policies:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Cache-Enabled": "true", # Enable caching of responses
"Cache-Control": "max-age=3600", # Set cache limit to 1 hour
"Helicone-RateLimit-Policy": "100;w=3600;s=user", # Set rate limit policy
}
```
### Session Tracking and Tracing
Track multi-step and agentic LLM interactions using session IDs and paths:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Session-Id": "session-abc-123", # The session ID you want to track
"Helicone-Session-Path": "parent-trace/child-trace", # The path of the session
}
```
- `Helicone-Session-Id`: Use this to specify the unique identifier for the session you want to track. This allows you to group related requests together.
- `Helicone-Session-Path`: This header defines the path of the session, allowing you to represent parent and child traces. For example, "parent/child" represents a child trace of a parent trace.
By using these two headers, you can effectively group and visualize multi-step LLM interactions, gaining insights into complex AI workflows.
### Retry and Fallback Mechanisms
Set up retry mechanisms and fallback options:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Retry-Enabled": "true", # Enable retry mechanism
"helicone-retry-num": "3", # Set number of retries
"helicone-retry-factor": "2", # Set exponential backoff factor
"Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', # Set fallback models
}
```
> **Supported Headers** - For a full list of supported Helicone headers and their descriptions, please refer to the [Helicone documentation](https://docs.helicone.ai/getting-started/quick-start).
> By utilizing these headers and metadata options, you can gain deeper insights into your LLM usage, optimize performance, and better manage your AI workflows with Helicone and LiteLLM.

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
# 🔥 Langfuse - Logging LLM Input/Output # 🪢 Langfuse - Logging LLM Input/Output
LangFuse is open Source Observability & Analytics for LLM Apps LangFuse is open Source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency Detailed production traces and a granular view on quality, cost and latency
@ -200,6 +200,13 @@ The following parameters can be updated on a continuation of a trace by passing
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation. Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
#### Disable Logging - Specific Calls
To disable logging for specific calls use the `no-log` flag.
`completion(messages = ..., model = ..., **{"no-log": True})`
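A minimal sketch of what this looks like in practice (assuming Langfuse is already configured as a success callback):
```python
import litellm

litellm.success_callback = ["langfuse"]

# logged to Langfuse as usual
litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "log me"}],
)

# skipped by the logging callbacks
litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "don't log me"}],
    **{"no-log": True},
)
```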
### Use LangChain ChatLiteLLM + Langfuse ### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id`, `session_id` in model_kwargs Pass `trace_user_id`, `session_id` in model_kwargs
```python ```python

View file

@ -14,7 +14,7 @@ https://github.com/BerriAI/litellm
An all-in-one developer platform for every step of the application lifecycle An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/ https://smith.langchain.com/
<Image img={require('../../img/langsmith.png')} /> <Image img={require('../../img/langsmith_new.png')} />
:::info :::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
@ -56,7 +56,7 @@ response = litellm.completion(
``` ```
## Advanced ## Advanced
### Set Custom Project & Run names ### Set Langsmith fields - Custom Project, Run names, tags
```python ```python
import litellm import litellm
@ -77,6 +77,7 @@ response = litellm.completion(
metadata={ metadata={
"run_name": "litellmRUN", # langsmith run name "run_name": "litellmRUN", # langsmith run name
"project_name": "litellm-completion", # langsmith project name "project_name": "litellm-completion", # langsmith project name
"tags": ["model1", "prod-2"] # tags to log on langsmith
} }
) )
print(response) print(response)

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
# 🔥 Logfire - Logging LLM Input/Output # Logfire
Logfire is open Source Observability & Analytics for LLM Apps Logfire is open Source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency Detailed production traces and a granular view on quality, cost and latency

View file

@ -1,10 +1,16 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Raw Request/Response Logging # Raw Request/Response Logging
## Logging
See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.). See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.).
**on SDK** <Tabs>
<TabItem value="sdk" label="SDK">
```python ```python
# pip install langfuse # pip install langfuse
import litellm import litellm
@ -34,13 +40,85 @@ response = litellm.completion(
) )
``` ```
**on Proxy**
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml ```yaml
litellm_settings: litellm_settings:
log_raw_request_response: True log_raw_request_response: True
``` ```
</TabItem>
</Tabs>
**Expected Log** **Expected Log**
<Image img={require('../../img/raw_request_log.png')}/> <Image img={require('../../img/raw_request_log.png')}/>
## Return Raw Response Headers
Return raw response headers from the LLM provider. Currently only supported for OpenAI.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
litellm.return_response_headers = True
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
print(response._hidden_params)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/GROQ_API_KEY
litellm_settings:
return_response_headers: true
```
2. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
</TabItem>
</Tabs>
**Expected Response**
<Image img={require('../../img/raw_response_headers.png')}/>

View file

@ -0,0 +1,97 @@
# Scrub Logged Data
Redact messages / mask PII before sending data to logging integrations (langfuse/etc.).
See our [**Presidio PII Masking**](https://github.com/BerriAI/litellm/blob/a176feeacc5fdf504747978d82056eb84679c4be/litellm/proxy/hooks/presidio_pii_masking.py#L286) for reference.
1. Setup a custom callback
```python
from typing import Any, List, Optional, Tuple

from litellm.integrations.custom_logger import CustomLogger


class MyCustomHandler(CustomLogger):
    async def async_logging_hook(
        self, kwargs: dict, result: Any, call_type: str
    ) -> Tuple[dict, Any]:
        """
        For masking logged request/response. Return a modified version of the request/result.
        Called before `async_log_success_event`.
        """
        if (
            call_type == "completion" or call_type == "acompletion"
        ):  # /chat/completions requests
            messages: Optional[List] = kwargs.get("messages", None)
            kwargs["messages"] = [{"role": "user", "content": "MASK_THIS_ASYNC_VALUE"}]

        return kwargs, result

    def logging_hook(
        self, kwargs: dict, result: Any, call_type: str
    ) -> Tuple[dict, Any]:
        """
        For masking logged request/response. Return a modified version of the request/result.
        Called before `log_success_event`.
        """
        if (
            call_type == "completion" or call_type == "acompletion"
        ):  # /chat/completions requests
            messages: Optional[List] = kwargs.get("messages", None)
            kwargs["messages"] = [{"role": "user", "content": "MASK_THIS_SYNC_VALUE"}]

        return kwargs, result


customHandler = MyCustomHandler()
```
2. Connect custom handler to LiteLLM
```python
import litellm
litellm.callbacks = [customHandler]
```
3. Test it!
```python
# pip install langfuse
import os
import asyncio
import litellm
from litellm import completion, acompletion

os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
# Optional, defaults to https://cloud.langfuse.com
# os.environ["LANGFUSE_HOST"] = ""

# LLM API Keys
os.environ['OPENAI_API_KEY']=""

litellm.callbacks = [customHandler]
litellm.success_callback = ["langfuse"]

## sync
response = completion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai"}],
                      stream=True)
for chunk in response:
    continue

## async
async def run_acompletion():
    response = await acompletion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai"}],
                                 stream=True)
    async for chunk in response:
        continue

asyncio.run(run_acompletion())
```

View file

@ -1,3 +1,4 @@
# Sentry - Log LLM Exceptions
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
@ -9,7 +10,6 @@ https://github.com/BerriAI/litellm
::: :::
# Sentry - Log LLM Exceptions
[Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration [Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration
Track exceptions for: Track exceptions for:

View file

@ -0,0 +1,263 @@
# [BETA] OpenID Connect (OIDC)
LiteLLM supports using OpenID Connect (OIDC) for authentication to upstream services. This allows you to avoid storing sensitive credentials in your configuration files.
:::info
This feature is in Beta
:::
## OIDC Identity Provider (IdP)
LiteLLM supports the following OIDC identity providers:
| Provider | Config Name | Custom Audiences |
| -------------------------| ------------ | ---------------- |
| Google Cloud Run | `google` | Yes |
| CircleCI v1 | `circleci` | No |
| CircleCI v2 | `circleci_v2`| No |
| GitHub Actions | `github` | Yes |
| Azure Kubernetes Service | `azure` | No |
| File | `file` | No |
| Environment Variable | `env` | No |
| Environment Path | `env_path` | No |
If you would like to use a different OIDC provider, please open an issue on GitHub.
:::tip
Do not use the `file`, `env`, or `env_path` providers unless you know what you're doing, and you are sure none of the other providers will work for your use-case. Hint: they probably will.
:::
## OIDC Connect Relying Party (RP)
LiteLLM supports the following OIDC relying parties / clients:
- Amazon Bedrock
- Azure OpenAI
- _(Coming soon) Google Cloud Vertex AI_
### Configuring OIDC
Wherever a secret key can be used, OIDC can be used in-place. The general format is:
```
oidc/config_name_here/audience_here
```
For providers that do not use the `audience` parameter, you can (and should) omit it:
```
oidc/config_name_here/
```
#### Unofficial Providers (not recommended)
For the unofficial `file` provider, you can use the following format:
```
oidc/file/home/user/dave/this_is_a_file_with_a_token.txt
```
For the unofficial `env`, use the following format, where `SECRET_TOKEN` is the name of the environment variable that contains the token:
```
oidc/env/SECRET_TOKEN
```
For the unofficial `env_path`, use the following format, where `SECRET_TOKEN` is the name of the environment variable that contains the path to the file with the token:
```
oidc/env_path/SECRET_TOKEN
```
:::tip
If you are tempted to use `oidc/env_path/AZURE_FEDERATED_TOKEN_FILE`, don't do that. Instead, use `oidc/azure/`, as this will ensure continued support from LiteLLM if Azure changes their OIDC configuration and/or adds new features.
:::
## Examples
### Google Cloud Run -> Amazon Bedrock
```yaml
model_list:
- model_name: claude-3-haiku-20240307
litellm_params:
model: bedrock/anthropic.claude-3-haiku-20240307-v1:0
aws_region_name: us-west-2
aws_session_name: "litellm"
aws_role_name: "arn:aws:iam::YOUR_THING_HERE:role/litellm-google-demo"
aws_web_identity_token: "oidc/google/https://example.com"
```
### CircleCI v2 -> Amazon Bedrock
```yaml
model_list:
- model_name: command-r
litellm_params:
model: bedrock/cohere.command-r-v1:0
aws_region_name: us-west-2
aws_session_name: "my-test-session"
aws_role_name: "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
aws_web_identity_token: "oidc/circleci_v2/"
```
#### Amazon IAM Role Configuration for CircleCI v2 -> Bedrock
The configuration below is only an example. You should adjust the permissions and trust relationship to match your specific use case.
Permissions:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor0",
"Effect": "Allow",
"Action": [
"bedrock:InvokeModel",
"bedrock:InvokeModelWithResponseStream"
],
"Resource": [
"arn:aws:bedrock:*::foundation-model/anthropic.claude-3-haiku-20240307-v1:0",
"arn:aws:bedrock:*::foundation-model/cohere.command-r-v1:0"
]
}
]
}
```
See https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html for more examples.
Trust Relationship:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Federated": "arn:aws:iam::335785316107:oidc-provider/oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"Action": "sts:AssumeRoleWithWebIdentity",
"Condition": {
"StringEquals": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:aud": "c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"ForAnyValue:StringLike": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:sub": [
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/main",
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/litellm_*"
]
}
}
}
]
}
```
This trust relationship restricts CircleCI to only assume the role on the main branch and branches that start with `litellm_`.
For CircleCI (v1 and v2), you also need to add your organization's OIDC provider in your AWS IAM settings. See https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-idp_oidc.html for more information.
:::tip
You should _never_ need to create an IAM user. If you did, you're not using OIDC correctly. You should only be creating a role with permissions and a trust relationship to your OIDC provider.
:::
### Google Cloud Run -> Azure OpenAI
```yaml
model_list:
- model_name: gpt-4o-2024-05-13
litellm_params:
model: azure/gpt-4o-2024-05-13
azure_ad_token: "oidc/google/https://example.com"
api_version: "2024-06-01"
api_base: "https://demo-here.openai.azure.com"
model_info:
base_model: azure/gpt-4o-2024-05-13
```
For Azure OpenAI, you need to define `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, and optionally `AZURE_AUTHORITY_HOST` in your environment.
```bash
export AZURE_CLIENT_ID="91a43c21-cf21-4f34-9085-331015ea4f91" # Azure AD Application (Client) ID
export AZURE_TENANT_ID="f3b1cf79-eba8-40c3-8120-cb26aca169c2" # Will be the same across all of your Azure AD applications
export AZURE_AUTHORITY_HOST="https://login.microsoftonline.com" # 👈 Optional, defaults to "https://login.microsoftonline.com"
```
:::tip
You can find `AZURE_CLIENT_ID` by visiting `https://login.microsoftonline.com/YOUR_DOMAIN_HERE/v2.0/.well-known/openid-configuration` and looking for the UUID in the `issuer` field.
:::
:::tip
Don't set `AZURE_AUTHORITY_HOST` in your environment unless you need to override the default value. This way, if the default value changes in the future, you won't need to update your environment.
:::
:::tip
By default, Azure AD applications use the audience `api://AzureADTokenExchange`. We recommend setting the audience to something more specific to your application.
:::
#### Azure AD Application Configuration
Unfortunately, Azure is a bit more complicated to set up than other OIDC relying parties like AWS. Basically, you have to:
1. Create an Azure application.
2. Add a federated credential for the OIDC IdP you're using (e.g. Google Cloud Run).
3. Add the Azure application to the resource group that contains the Azure OpenAI resource(s).
4. Give the Azure application the necessary role to access the Azure OpenAI resource(s).
The custom role below is the recommended minimum permissions for the Azure application to access Azure OpenAI resources. You should adjust the permissions to match your specific use case.
```json
{
"id": "/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/providers/Microsoft.Authorization/roleDefinitions/baf42808-99ff-466d-b9da-f95bb0422c5f",
"properties": {
"roleName": "invoke-only",
"description": "",
"assignableScopes": [
"/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/resourceGroups/your-openai-group-name"
],
"permissions": [
{
"actions": [],
"notActions": [],
"dataActions": [
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/audio/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/search/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/extensions/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/embeddings/action",
"Microsoft.CognitiveServices/accounts/OpenAI/images/generations/action"
],
"notDataActions": []
}
]
}
}
```
_Note: Your UUIDs will be different._
Please contact us for paid enterprise support if you need help setting up Azure AD applications.

View file

@ -0,0 +1,236 @@
# Bedrock (Pass-Through)
Pass-through endpoints for Bedrock - call provider-specific endpoint, in native format (no translation).
Just replace `https://bedrock-runtime.{aws_region_name}.amazonaws.com` with `LITELLM_PROXY_BASE_URL/bedrock` 🚀
#### **Example Usage**
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer anything' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
Supports **ALL** Bedrock Endpoints (including streaming).
[**See All Bedrock Endpoints**](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html)
## Quick Start
Let's call the Bedrock [`/converse` endpoint](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html)
1. Add AWS Keys to your environment
```bash
export AWS_ACCESS_KEY_ID="" # Access key
export AWS_SECRET_ACCESS_KEY="" # Secret access key
export AWS_REGION_NAME="" # us-east-1, us-east-2, us-west-1, us-west-2
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Bedrock converse endpoint
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer anything' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
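The same call from Python, using `requests` against the proxy (a sketch; assumes the proxy is running locally and no virtual keys are enforced):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse",
    headers={
        "Authorization": "Bearer anything",  # use a virtual key here if keys are enforced
        "Content-Type": "application/json",
    },
    json={"messages": [{"role": "user", "content": [{"text": "Hello"}]}]},
)
print(resp.json())
```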
## Examples
Anything after `http://0.0.0.0:4000/bedrock` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://bedrock-runtime.{aws_region_name}.amazonaws.com` | `http://0.0.0.0:4000/bedrock` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `AWS4-HMAC-SHA256..` | `Bearer anything` (use `Bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Converse API**
#### LiteLLM Proxy Call
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
#### Direct Bedrock API Call
```bash
curl -X POST 'https://bedrock-runtime.us-west-2.amazonaws.com/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: AWS4-HMAC-SHA256..' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
### **Example 2: Apply Guardrail**
#### LiteLLM Proxy Call
```bash
curl "http://0.0.0.0:4000/bedrock/guardrail/guardrailIdentifier/version/guardrailVersion/apply" \
-H 'Authorization: Bearer sk-anything' \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{"text": {"text": "Hello world"}}],
"source": "INPUT"
}'
```
#### Direct Bedrock API Call
```bash
curl "https://bedrock-runtime.us-west-2.amazonaws.com/guardrail/guardrailIdentifier/version/guardrailVersion/apply" \
-H 'Authorization: AWS4-HMAC-SHA256..' \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{"text": {"text": "Hello world"}}],
"source": "INPUT"
}'
```
### **Example 3: Query Knowledge Base**

#### LiteLLM Proxy Call
```bash
curl -X POST "http://0.0.0.0:4000/bedrock/knowledgebases/{knowledgeBaseId}/retrieve" \
-H 'Authorization: Bearer sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"nextToken": "string",
"retrievalConfiguration": {
"vectorSearchConfiguration": {
"filter": { ... },
"numberOfResults": number,
"overrideSearchType": "string"
}
},
"retrievalQuery": {
"text": "string"
}
}'
```
#### Direct Bedrock API Call
```bash
curl -X POST "https://bedrock-runtime.us-west-2.amazonaws.com/knowledgebases/{knowledgeBaseId}/retrieve" \
-H 'Authorization: AWS4-HMAC-SHA256..' \
-H 'Content-Type: application/json' \
-d '{
"nextToken": "string",
"retrievalConfiguration": {
"vectorSearchConfiguration": {
"filter": { ... },
"numberOfResults": number,
"overrideSearchType": "string"
}
},
"retrievalQuery": {
"text": "string"
}
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw AWS Keys, but still letting them use AWS Bedrock endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export AWS_ACCESS_KEY_ID="" # Access key
export AWS_SECRET_ACCESS_KEY="" # Secret access key
export AWS_REGION_NAME="" # us-east-1, us-east-2, us-west-1, us-west-2
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```

View file

@ -0,0 +1,253 @@
# Cohere API (Pass-Through)
Pass-through endpoints for Cohere - call provider-specific endpoint, in native format (no translation).
Just replace `https://api.cohere.com` with `LITELLM_PROXY_BASE_URL/cohere` 🚀
#### **Example Usage**
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/chat \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"chat_history": [
{"role": "USER", "message": "Who discovered gravity?"},
{"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
],
"message": "What year was he born?",
"connectors": [{"id": "web-search"}]
}'
```
Supports **ALL** Cohere Endpoints (including streaming).
[**See All Cohere Endpoints**](https://docs.cohere.com/reference/chat)
## Quick Start
Let's call the Cohere [`/rerank` endpoint](https://docs.cohere.com/reference/rerank)
1. Add Cohere API Key to your environment
```bash
export COHERE_API_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Cohere /rerank endpoint
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```
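Equivalently, from Python with `requests` (a sketch against a locally running proxy; the document list is shortened for brevity):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/cohere/v1/rerank",
    headers={
        "Authorization": "bearer sk-anything",  # swap in a virtual key if keys are enforced
        "Content-Type": "application/json",
    },
    json={
        "model": "rerank-english-v3.0",
        "query": "What is the capital of the United States?",
        "top_n": 3,
        "documents": [
            "Carson City is the capital city of the American state of Nevada.",
            "Washington, D.C. is the capital of the United States. It is a federal district.",
        ],
    },
)
print(resp.json())
```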
## Examples
Anything after `http://0.0.0.0:4000/cohere` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://api.cohere.com` | `http://0.0.0.0:4000/cohere` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $CO_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Rerank endpoint**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```
#### Direct Cohere API Call
```bash
curl --request POST \
--url https://api.cohere.com/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer $CO_API_KEY" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/chat \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"chat_history": [
{"role": "USER", "message": "Who discovered gravity?"},
{"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
],
"message": "What year was he born?",
"connectors": [{"id": "web-search"}]
}'
```
#### Direct Cohere API Call
```bash
curl --request POST \
--url https://api.cohere.com/v1/chat \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer $CO_API_KEY" \
--data '{
"chat_history": [
{"role": "USER", "message": "Who discovered gravity?"},
{"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
],
"message": "What year was he born?",
"connectors": [{"id": "web-search"}]
}'
```
### **Example 3: Embedding**

#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/embed \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "embed-english-v3.0",
"texts": ["hello", "goodbye"],
"input_type": "classification"
}'
```
#### Direct Cohere API Call
```bash
curl --request POST \
--url https://api.cohere.com/v1/embed \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer $CO_API_KEY" \
--data '{
"model": "embed-english-v3.0",
"texts": ["hello", "goodbye"],
"input_type": "classification"
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Cohere API key, but still letting them use Cohere endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export COHERE_API_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-1234ewknldferwedojwojw" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```

View file

@ -0,0 +1,223 @@
# Google AI Studio (Pass-Through)
Pass-through endpoints for Google AI Studio - call provider-specific endpoint, in native format (no translation).
Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini` 🚀
#### **Example Usage**
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```
Supports **ALL** Google AI Studio Endpoints (including streaming).
[**See All Google AI Studio Endpoints**](https://ai.google.dev/api)
## Quick Start
Let's call the Gemini [`/countTokens` endpoint](https://ai.google.dev/api/tokens#method:-models.counttokens)
1. Add Gemini API Key to your environment
```bash
export GEMINI_API_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Google AI Studio token counting endpoint
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=anything' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```
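The same token-counting call from Python with `requests` (a sketch against a locally running proxy):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens",
    params={"key": "anything"},  # use a virtual key here if keys are enforced
    headers={"Content-Type": "application/json"},
    json={
        "contents": [
            {"parts": [{"text": "The quick brown fox jumps over the lazy dog."}]}
        ]
    },
)
print(resp.json())
```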
## Examples
Anything after `http://0.0.0.0:4000/gemini` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://generativelanguage.googleapis.com` | `http://0.0.0.0:4000/gemini` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `key=$GOOGLE_API_KEY` | `key=anything` (use `key=LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Counting tokens**
#### LiteLLM Proxy Call
```bash
curl http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=anything \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}],
}],
}'
```
#### Direct Google AI Studio Call
```bash
curl https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:countTokens?key=$GOOGLE_API_KEY \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}],
}],
}'
```
### **Example 2: Generate content**
#### LiteLLM Proxy Call
```bash
curl "http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:generateContent?key=anything" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{"text": "Write a story about a magic backpack."}]
}]
}' 2> /dev/null
```
#### Direct Google AI Studio Call
```bash
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{"text": "Write a story about a magic backpack."}]
}]
}' 2> /dev/null
```
### **Example 3: Caching**

#### LiteLLM Proxy Call
```bash
curl -X POST "http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash-001:generateContent?key=anything" \
-H 'Content-Type: application/json' \
-d '{
"contents": [
{
"parts":[{
"text": "Please summarize this transcript"
}],
"role": "user"
},
],
"cachedContent": "'$CACHE_NAME'"
}'
```
#### Direct Google AI Studio Call
```bash
curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-001:generateContent?key=$GOOGLE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"contents": [
{
"parts":[{
"text": "Please summarize this transcript"
}],
"role": "user"
},
],
"cachedContent": "'$CACHE_NAME'"
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Google AI Studio key, but still letting them use Google AI Studio endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export GEMINI_API_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-1234ewknldferwedojwojw' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```

View file

@ -0,0 +1,132 @@
# Langfuse Endpoints (Pass-Through)
Pass-through endpoints for Langfuse - call langfuse endpoints with LiteLLM Virtual Key.
Just replace `https://us.cloud.langfuse.com` with `LITELLM_PROXY_BASE_URL/langfuse` 🚀
#### **Example Usage**
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000/langfuse", # your litellm proxy endpoint
public_key="anything", # no key required since this is a pass through
secret_key="LITELLM_VIRTUAL_KEY", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
Supports **ALL** Langfuse Endpoints.
[**See All Langfuse Endpoints**](https://api.reference.langfuse.com/)
## Quick Start
Let's log a trace to Langfuse.
1. Add Langfuse Public/Private keys to environment
```bash
export LANGFUSE_PUBLIC_KEY=""
export LANGFUSE_PRIVATE_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's log a trace to Langfuse!
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000/langfuse", # your litellm proxy endpoint
public_key="anything", # no key required since this is a pass through
secret_key="anything", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Langfuse keys, but still letting them use Langfuse endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export LANGFUSE_PUBLIC_KEY=""
export LANGFUSE_PRIVATE_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000/langfuse", # your litellm proxy endpoint
public_key="anything", # no key required since this is a pass through
secret_key="sk-1234ewknldferwedojwojw", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
## [Advanced - Log to separate langfuse projects (by key/team)](../proxy/team_logging.md)

View file

@ -0,0 +1,101 @@
# [BETA] Vertex AI Endpoints (Pass-Through)
Pass-through endpoints for Vertex AI - call provider-specific endpoint, in native format (no translation).
:::tip
Looking for the Unified API (OpenAI format) for VertexAI? [Go here - using vertexAI with LiteLLM SDK or LiteLLM Proxy Server](../docs/providers/vertex.md)
:::
## Supported API Endpoints
- Gemini API
- Embeddings API
- Imagen API
- Code Completion API
- Batch prediction API
- Tuning API
- CountTokens API
## Quick Start Usage
#### 1. Set `default_vertex_config` on your `config.yaml`
Add the following credentials to your litellm config.yaml to use the Vertex AI endpoints.
```yaml
default_vertex_config:
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
#### 2. Start litellm proxy
```shell
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
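Or the same call from Python with `requests` (a sketch; assumes the proxy runs locally with the `default_vertex_config` above and `sk-1234` is your proxy key):
```python
import requests

resp = requests.post(
    "http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:countTokens",
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer sk-1234",  # your litellm proxy key
    },
    json={"instances": [{"content": "gm"}]},
)
print(resp.json())
```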
## Usage Examples
### Gemini API (Generate Content)
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:generateContent \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Embeddings API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
### Imagen API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/imagen-3.0-generate-001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}'
```
### Count Tokens API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Tuning API
Create Fine Tuning Job
```shell
curl http://localhost:4000/vertex-ai/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```

View file

@ -22,6 +22,7 @@ Anthropic API fails requests when `max_tokens` are not passed. Due to this litel
import os import os
os.environ["ANTHROPIC_API_KEY"] = "your-api-key" os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# os.environ["ANTHROPIC_API_BASE"] = "" # [OPTIONAL] or 'ANTHROPIC_BASE_URL'
``` ```
## Usage ## Usage
@ -55,7 +56,7 @@ for chunk in response:
print(chunk["choices"][0]["delta"]["content"]) # same as openai format print(chunk["choices"][0]["delta"]["content"]) # same as openai format
``` ```
## OpenAI Proxy Usage ## Usage with LiteLLM Proxy
Here's how to call Anthropic with the LiteLLM Proxy Server Here's how to call Anthropic with the LiteLLM Proxy Server
@ -68,14 +69,6 @@ export ANTHROPIC_API_KEY="your-api-key"
### 2. Start the proxy ### 2. Start the proxy
<Tabs> <Tabs>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml"> <TabItem value="config" label="config.yaml">
```yaml ```yaml
@ -90,6 +83,55 @@ model_list:
litellm --config /path/to/config.yaml litellm --config /path/to/config.yaml
``` ```
</TabItem> </TabItem>
<TabItem value="config-all" label="config - default all Anthropic Model">
Use this if you want to make requests to `claude-3-haiku-20240307`, `claude-3-opus-20240229`, `claude-2.1` without defining them on the config.yaml
#### Required env variables
```
ANTHROPIC_API_KEY=sk-ant****
```
```yaml
model_list:
- model_name: "*"
litellm_params:
model: "*"
```
```bash
litellm --config /path/to/config.yaml
```
Example Request for this config.yaml
**Ensure you use `anthropic/` prefix to route the request to Anthropic API**
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "anthropic/claude-3-haiku-20240307",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
</Tabs> </Tabs>
### 3. Test it ### 3. Test it
@ -183,9 +225,336 @@ print(response)
| claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` | | claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` | | claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
## Advanced ## **Prompt Caching**
## Usage - Function Calling Use Anthropic Prompt Caching
[Relevant Anthropic API Docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)
### Caching - Large Context Caching
This example demonstrates basic Prompt Caching usage, caching the full text of the legal agreement as a prefix while keeping the user instruction uncached.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Tools definitions
In this example, we demonstrate caching tool definitions.
The cache_control parameter is placed on the final tool
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}],
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}],
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Continuing Multi-Turn Convo
In this example, we demonstrate how to use Prompt Caching in a multi-turn conversation.
The cache_control parameter is placed on the system message to designate it as part of the static prefix.
The conversation history (previous messages) is included in the messages array. The final turn is marked with cache-control, for continuing in followups. The second-to-last user message is marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
## **Function/Tool Calling**
:::info :::info
@ -374,6 +743,20 @@ resp = litellm.completion(
print(f"\nResponse: {resp}") print(f"\nResponse: {resp}")
``` ```
## **Passing Extra Headers to Anthropic API**
Pass `extra_headers: dict` to `litellm.completion`
```python
from litellm import completion
messages = [{"role": "user", "content": "What is Anthropic?"}]
response = completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}
)
```
## Usage - "Assistant Pre-fill" ## Usage - "Assistant Pre-fill"
You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array. You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.

View file

@ -1,10 +1,18 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'
# AWS Sagemaker # AWS Sagemaker
LiteLLM supports All Sagemaker Huggingface Jumpstart Models LiteLLM supports All Sagemaker Huggingface Jumpstart Models
:::tip
**We support ALL Sagemaker models, just set `model=sagemaker/<any-model-on-sagemaker>` as a prefix when sending litellm requests**
:::
### API KEYS ### API KEYS
```python ```python
!pip install boto3
os.environ["AWS_ACCESS_KEY_ID"] = "" os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = "" os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = "" os.environ["AWS_REGION_NAME"] = ""
@ -27,6 +35,327 @@ response = completion(
) )
``` ```
### Usage - Streaming
Sagemaker currently does not support streaming - LiteLLM fakes streaming by returning chunks of the response string
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.2,
max_tokens=80,
stream=True,
)
for chunk in response:
print(chunk)
```
## **LiteLLM Proxy Usage**
Here's how to call Sagemaker with the LiteLLM Proxy Server
### 1. Setup config.yaml
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
```
All possible auth params:
```
aws_access_key_id: Optional[str],
aws_secret_access_key: Optional[str],
aws_session_token: Optional[str],
aws_region_name: Optional[str],
aws_session_name: Optional[str],
aws_profile_name: Optional[str],
aws_role_name: Optional[str],
aws_web_identity_token: Optional[str],
```
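These auth params can also be passed per-request via the SDK instead of the proxy config - a minimal sketch with placeholder values (any of the params above can be supplied this way):
```python
from litellm import completion
# Sketch: pass AWS auth params directly on the request (values are placeholders)
response = completion(
    model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    aws_access_key_id="my-access-key-id",
    aws_secret_access_key="my-secret-access-key",
    aws_region_name="us-west-2",
)
print(response)
```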
### 2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "jumpstart-model",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "jumpstart-model",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Set temperature, top p, etc.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.7,
top_p=1
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set on yaml**
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
temperature: <your-temp>
top_p: <your-top-p>
```
**Set on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
top_p=1
)
print(response)
```
</TabItem>
</Tabs>
## **Allow setting temperature=0** for Sagemaker
By default, when `temperature=0` is sent in requests to LiteLLM, LiteLLM rounds it up to `temperature=0.1`, since Sagemaker rejects most requests with `temperature=0`.
If you want to send `temperature=0` for your model, here's how to set it up (since Sagemaker can host any kind of model, some models do allow zero temperature):
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0,
aws_sagemaker_allow_zero_temp=True,
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set `aws_sagemaker_allow_zero_temp` on yaml**
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
aws_sagemaker_allow_zero_temp: true
```
**Set `temperature=0` on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0,
)
print(response)
```
</TabItem>
</Tabs>
## Pass provider-specific params
If you pass a non-openai param to litellm, we'll assume it's provider-specific and send it as a kwarg in the request body. [See more](../completion/input.md#provider-specific-params)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
messages=[{ "content": "Hello, how are you?","role": "user"}],
top_k=1 # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set on yaml**
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
top_k: 1 # 👈 PROVIDER-SPECIFIC PARAM
```
**Set on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
"top_k": 1 # 👈 PROVIDER-SPECIFIC PARAM
}
)
print(response)
```
</TabItem>
</Tabs>
### Passing Inference Component Name ### Passing Inference Component Name
If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`. If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`.
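As a rough sketch of what this looks like with the SDK (the endpoint and component names below are placeholders):
```python
from litellm import completion
# Sketch: one endpoint hosting multiple models - select the component via model_id
response = completion(
    model="sagemaker/my-multi-model-endpoint",   # placeholder endpoint name
    model_id="my-inference-component-name",      # placeholder inference component name
    messages=[{"content": "Hello, how are you?", "role": "user"}],
)
print(response)
```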
@ -85,29 +414,16 @@ response = completion(
You can also pass in your own [custom prompt template](../completion/prompt_formatting.md#format-prompt-yourself) You can also pass in your own [custom prompt template](../completion/prompt_formatting.md#format-prompt-yourself)
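For illustration, a sketch of registering a template for your endpoint via `litellm.register_prompt_template` (the role markers below are placeholders - use whatever format your model expects):
```python
import litellm
# Sketch: register a custom prompt format for a specific sagemaker endpoint
litellm.register_prompt_template(
    model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
    initial_prompt_value="<s>",  # placeholder prompt prefix
    roles={
        "system": {"pre_message": "[SYS] ", "post_message": " [/SYS]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
        "assistant": {"pre_message": "", "post_message": "\n"},
    },
    final_prompt_value="\n",  # placeholder prompt suffix
)
```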
### Usage - Streaming
Sagemaker currently does not support streaming - LiteLLM fakes streaming by returning chunks of the response string
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.2,
max_tokens=80,
stream=True,
)
for chunk in response:
print(chunk)
```
### Completion Models ### Completion Models
:::tip
**We support ALL Sagemaker models, just set `model=sagemaker/<any-model-on-sagemaker>` as a prefix when sending litellm requests**
:::
Here's an example of using a sagemaker model with LiteLLM Here's an example of using a sagemaker model with LiteLLM
| Model Name | Function Call | | Model Name | Function Call |
@ -120,7 +436,7 @@ Here's an example of using a sagemaker model with LiteLLM
| Meta Llama 2 70B | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 70B | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 70B (Chat/Fine-tuned) | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b-b-f', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 70B (Chat/Fine-tuned) | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b-b-f', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
### Embedding Models ## Embedding Models
LiteLLM supports all Sagemaker Jumpstart Huggingface Embedding models. Here's how to call it: LiteLLM supports all Sagemaker Jumpstart Huggingface Embedding models. Here's how to call it:

View file

@ -66,8 +66,15 @@ response = litellm.completion(
## Azure OpenAI Chat Completion Models ## Azure OpenAI Chat Completion Models
:::tip
**We support ALL Azure models, just set `model=azure/<your deployment name>` as a prefix when sending litellm requests**
:::
| Model Name | Function Call | | Model Name | Function Call |
|------------------|----------------------------------------| |------------------|----------------------------------------|
| gpt-4o-mini | `completion('azure/<your deployment name>', messages)` |
| gpt-4o | `completion('azure/<your deployment name>', messages)` | | gpt-4o | `completion('azure/<your deployment name>', messages)` |
| gpt-4 | `completion('azure/<your deployment name>', messages)` | | gpt-4 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` | | gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |

View file

@ -36,40 +36,40 @@ response = completion(
) )
``` ```
## OpenAI Proxy Usage ## LiteLLM Proxy Usage
Here's how to call Anthropic with the LiteLLM Proxy Server Here's how to call Anthropic with the LiteLLM Proxy Server
### 1. Save key in your environment ### 1. Setup config.yaml
```bash
export AWS_ACCESS_KEY_ID=""
export AWS_SECRET_ACCESS_KEY=""
export AWS_REGION_NAME=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model anthropic.claude-3-sonnet-20240229-v1:0
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml ```yaml
model_list: model_list:
- model_name: bedrock-claude-v1 - model_name: bedrock-claude-v1
litellm_params: litellm_params:
model: bedrock/anthropic.claude-instant-v1 model: bedrock/anthropic.claude-instant-v1
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
``` ```
</TabItem>
</Tabs>
All possible auth params:
```
aws_access_key_id: Optional[str],
aws_secret_access_key: Optional[str],
aws_session_token: Optional[str],
aws_region_name: Optional[str],
aws_session_name: Optional[str],
aws_profile_name: Optional[str],
aws_role_name: Optional[str],
aws_web_identity_token: Optional[str],
```
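These can also be passed per-request via the SDK - for example, a sketch of STS role-based auth (the role ARN and session name are placeholders):
```python
from litellm import completion
# Sketch: role-based auth on a single request (values are placeholders)
response = completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    aws_region_name="us-east-1",
    aws_role_name="arn:aws:iam::123456789012:role/my-bedrock-role",  # placeholder ARN
    aws_session_name="my-litellm-session",
)
print(response)
```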
### 2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
### 3. Test it ### 3. Test it
@ -360,6 +360,120 @@ resp = litellm.completion(
print(f"\nResponse: {resp}") print(f"\nResponse: {resp}")
``` ```
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
from litellm import completion
# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="anthropic.claude-v2",
messages=[
{
"content": "where do i buy coffee from? ",
"role": "user",
}
],
max_tokens=10,
guardrailConfig={
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
)
```
</TabItem>
<TabItem value="proxy" label="Proxy on request">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="anthropic.claude-v2", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
"guardrailConfig": {
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
}
)
print(response)
```
</TabItem>
<TabItem value="proxy-config" label="Proxy on config.yaml">
1. Update config.yaml
```yaml
model_list:
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
guardrailConfig: {
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
}
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="bedrock-claude-v1", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7
)
print(response)
```
</TabItem>
</Tabs>
## Usage - "Assistant Pre-fill" ## Usage - "Assistant Pre-fill"
If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array. If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
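A minimal sketch (the model, prompt, and pre-fill text are illustrative):
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
# The trailing assistant message is the pre-fill - Claude continues from "["
response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[
        {"role": "user", "content": "List three primary colors as a JSON array."},
        {"role": "assistant", "content": "["},
    ],
)
print(response.choices[0].message.content)
```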
@ -623,7 +737,7 @@ response = litellm.embedding(
## Supported AWS Bedrock Models ## Supported AWS Bedrock Models
Here's an example of using a bedrock model with LiteLLM Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
@ -641,6 +755,7 @@ Here's an example of using a bedrock model with LiteLLM
| Cohere Command | `completion(model='bedrock/cohere.command-text-v14', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Cohere Command | `completion(model='bedrock/cohere.command-text-v14', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| AI21 J2-Mid | `completion(model='bedrock/ai21.j2-mid-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | AI21 J2-Mid | `completion(model='bedrock/ai21.j2-mid-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| AI21 J2-Ultra | `completion(model='bedrock/ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | AI21 J2-Ultra | `completion(model='bedrock/ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| AI21 Jamba-Instruct | `completion(model='bedrock/ai21.jamba-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 Chat 13b | `completion(model='bedrock/meta.llama2-13b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 Chat 13b | `completion(model='bedrock/meta.llama2-13b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Mistral 7B Instruct | `completion(model='bedrock/mistral.mistral-7b-instruct-v0:2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Mistral 7B Instruct | `completion(model='bedrock/mistral.mistral-7b-instruct-v0:2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |

View file

@ -0,0 +1,168 @@
# Custom API Server (Custom Format)
Call your custom torch-serve / internal LLM APIs via LiteLLM
:::info
- For calling an openai-compatible endpoint, [go here](./openai_compatible.md)
- For modifying incoming/outgoing calls on proxy, [go here](../proxy/call_hooks.md)
:::
## Quick Start
```python
import litellm
from litellm import CustomLLM, completion, get_llm_provider
class MyCustomLLM(CustomLLM):
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
return litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
mock_response="Hi!",
) # type: ignore
my_custom_llm = MyCustomLLM()
litellm.custom_provider_map = [ # 👈 KEY STEP - REGISTER HANDLER
{"provider": "my-custom-llm", "custom_handler": my_custom_llm}
]
resp = completion(
model="my-custom-llm/my-fake-model",
messages=[{"role": "user", "content": "Hello world!"}],
)
assert resp.choices[0].message.content == "Hi!"
```
## OpenAI Proxy Usage
1. Setup your `custom_handler.py` file
```python
import litellm
from litellm import CustomLLM, completion, get_llm_provider
class MyCustomLLM(CustomLLM):
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
return litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
mock_response="Hi!",
) # type: ignore
async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse:
return litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
mock_response="Hi!",
) # type: ignore
my_custom_llm = MyCustomLLM()
```
2. Add to `config.yaml`
In the config below, we pass
python_filename: `custom_handler.py`
custom_handler_instance_name: `my_custom_llm`. This is defined in Step 1
custom_handler: `custom_handler.my_custom_llm`
```yaml
model_list:
- model_name: "test-model"
litellm_params:
model: "openai/text-embedding-ada-002"
- model_name: "my-custom-model"
litellm_params:
model: "my-custom-llm/my-model"
litellm_settings:
custom_provider_map:
- {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm}
```
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "my-custom-model",
"messages": [{"role": "user", "content": "Say \"this is a test\" in JSON!"}],
}'
```
Expected Response
```
{
"id": "chatcmpl-06f1b9cd-08bc-43f7-9814-a69173921216",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1721955063,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
}
```
## Custom Handler Spec
```python
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from typing import Iterator, AsyncIterator
from litellm.llms.base import BaseLLM
class CustomLLMError(Exception): # use this for all your exceptions
def __init__(
self,
status_code,
message,
):
self.status_code = status_code
self.message = message
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class CustomLLM(BaseLLM):
def __init__(self) -> None:
super().__init__()
def completion(self, *args, **kwargs) -> ModelResponse:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
async def acompletion(self, *args, **kwargs) -> ModelResponse:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
```
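To make the spec concrete, here's a rough sketch of a handler that implements `streaming()` by yielding `GenericStreamingChunk` dicts (the chunk field values and provider name are illustrative, not a definitive implementation):
```python
from typing import Iterator
import litellm
from litellm import CustomLLM
from litellm.types.utils import GenericStreamingChunk
class MyStreamingLLM(CustomLLM):
    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        # Sketch: emit a hard-coded reply one word at a time, then a final "stop" chunk
        for word in ["Hello", "from", "my", "custom", "LLM"]:
            yield {
                "index": 0,
                "text": word + " ",
                "is_finished": False,
                "finish_reason": "",
                "tool_use": None,
                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
            }
        yield {
            "index": 0,
            "text": "",
            "is_finished": True,
            "finish_reason": "stop",
            "tool_use": None,
            "usage": {"prompt_tokens": 1, "completion_tokens": 5, "total_tokens": 6},
        }
litellm.custom_provider_map = [
    {"provider": "my-streaming-llm", "custom_handler": MyStreamingLLM()}
]
response = litellm.completion(
    model="my-streaming-llm/my-model",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in response:
    print(chunk.choices[0].delta.content or "", end="")
```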

View file

@ -1,129 +0,0 @@
# Custom API Server (OpenAI Format)
LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format
## API KEYS
No api keys required
## Set up your Custom API Server
Your server should have the following Endpoints:
Here's an example OpenAI proxy server with routes: https://replit.com/@BerriAI/openai-proxy#main.py
### Required Endpoints
- POST `/chat/completions` - chat completions endpoint
### Optional Endpoints
- POST `/completions` - completions endpoint
- Get `/models` - available models on server
- POST `/embeddings` - creates an embedding vector representing the input text.
## Example Usage
### Call `/chat/completions`
In order to use your custom OpenAI Chat Completion proxy with LiteLLM, ensure you set
* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co"
* `custom_llm_provider` to `openai` this ensures litellm uses the `openai.ChatCompletion` to your api_base
```python
import os
from litellm import completion
## set ENV variables
os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy
messages = [{ "content": "Hello, how are you?","role": "user"}]
response = completion(
model="command-nightly",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://openai-proxy.berriai.repl.co",
custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request
)
print(response)
```
#### Response
```json
{
"object":
"chat.completion",
"choices": [{
"finish_reason": "stop",
"index": 0,
"message": {
"content":
"The sky, a canvas of blue,\nA work of art, pure and true,\nA",
"role": "assistant"
}
}],
"id":
"chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8",
"created":
1699290237.408061,
"model":
"togethercomputer/llama-2-70b-chat",
"usage": {
"completion_tokens": 18,
"prompt_tokens": 14,
"total_tokens": 32
}
}
```
### Call `/completions`
In order to use your custom OpenAI Completion proxy with LiteLLM, ensure you set
* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co"
* `custom_llm_provider` to `text-completion-openai` this ensures litellm uses the `openai.Completion` to your api_base
```python
import os
from litellm import completion
## set ENV variables
os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy
messages = [{ "content": "Hello, how are you?","role": "user"}]
response = completion(
model="command-nightly",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://openai-proxy.berriai.repl.co",
custom_llm_provider="text-completion-openai" # litellm will use the openai.Completion to make the request
)
print(response)
```
#### Response
```json
{
"warning":
"This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations",
"id":
"cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r",
"object":
"text_completion",
"created":
1699290166,
"model":
"text-davinci-003",
"choices": [{
"text":
"\n\nThe weather in San Francisco varies depending on what time of year and time",
"index": 0,
"logprobs": None,
"finish_reason": "length"
}],
"usage": {
"prompt_tokens": 7,
"completion_tokens": 16,
"total_tokens": 23
}
}
```

View file

@ -5,6 +5,11 @@ import TabItem from '@theme/TabItem';
LiteLLM supports all models on Databricks LiteLLM supports all models on Databricks
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
## Usage ## Usage
@ -185,8 +190,17 @@ response = litellm.embedding(
## Supported Databricks Chat Completion Models ## Supported Databricks Chat Completion Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` | | databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` | | databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` | | databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
@ -196,6 +210,13 @@ response = litellm.embedding(
## Supported Databricks Embedding Models ## Supported Databricks Embedding Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
| databricks-bge-large-en | `embedding(model='databricks/databricks-bge-large-en', messages=messages)` | | databricks-bge-large-en | `embedding(model='databricks/databricks-bge-large-en', messages=messages)` |

View file

@ -1,7 +1,12 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Fireworks AI # Fireworks AI
https://fireworks.ai/ https://fireworks.ai/
:::info
**We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests** **We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests**
:::
## API Key ## API Key
```python ```python
@ -16,7 +21,7 @@ import os
os.environ['FIREWORKS_AI_API_KEY'] = "" os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion( response = completion(
model="fireworks_ai/mixtral-8x7b-instruct", model="fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -31,7 +36,7 @@ import os
os.environ['FIREWORKS_AI_API_KEY'] = "" os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion( response = completion(
model="fireworks_ai/mixtral-8x7b-instruct", model="fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -43,8 +48,103 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
### 1. Set Fireworks AI Models on config.yaml
```yaml
model_list:
- model_name: fireworks-llama-v3-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
api_key: "os.environ/FIREWORKS_AI_API_KEY"
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fireworks-llama-v3-70b-instruct",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="fireworks-llama-v3-70b-instruct", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "fireworks-llama-v3-70b-instruct",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Fireworks AI Models Supported! ## Supported Models - ALL Fireworks AI Models Supported!
:::info
We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests
:::
| Model Name | Function Call | | Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|

View file

@ -0,0 +1,60 @@
# FriendliAI
https://suite.friendli.ai/
**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['FRIENDLI_TOKEN']
os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint.
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models
### Serverless Endpoints
We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` |
| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` |
| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` |
### Dedicated Endpoints
```
model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE"
```
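For example, a sketch of calling a dedicated endpoint (the endpoint ID and adapter route are placeholders; set `FRIENDLI_API_BASE` as noted above):
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
os.environ['FRIENDLI_API_BASE'] = ""  # your dedicated endpoint's base URL
response = completion(
    model="friendliai/my-endpoint-id:my-adapter-route",  # placeholder $ENDPOINT_ID:$ADAPTER_ROUTE
    messages=[
        {"role": "user", "content": "hello from litellm"}
    ],
)
print(response)
```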

View file

@ -1,3 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Gemini - Google AI Studio # Gemini - Google AI Studio
## Pre-requisites ## Pre-requisites
@ -17,6 +21,335 @@ response = completion(
) )
``` ```
## Supported OpenAI Params
- temperature
- top_p
- max_tokens
- stream
- tools
- tool_choice
- response_format
- n
- stop
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
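As a quick illustration, here's a sketch combining several of the params above in one call (the prompt and values are arbitrary):
```python
from litellm import completion
import os
os.environ['GEMINI_API_KEY'] = ""
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
    temperature=0.2,
    top_p=0.9,
    max_tokens=100,
    n=1,
    stop=["---"],
)
print(response.choices[0].message.content)
```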
## Passing Gemini Specific Params
### Response schema
LiteLLM supports sending `response_schema` as a param for Gemini-1.5-Pro on Google AI Studio.
**Response Schema**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response_schema = {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
}
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object", "response_schema": response_schema} # 👈 KEY CHANGE
)
print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object", "response_schema": {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
}}
}
'
```
</TabItem>
</Tabs>
**Validate Schema**
To validate the response_schema, set `enforce_validation: true`.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion, JSONSchemaValidationError
try:
completion(
model="gemini/gemini-1.5-pro",
messages=messages,
response_format={
"type": "json_object",
"response_schema": response_schema,
"enforce_validation": true # 👈 KEY CHANGE
}
)
except JSONSchemaValidationError as e:
print("Raw Response: {}".format(e.raw_response))
raise e
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object", "response_schema": {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
},
"enforce_validation": true
}
}
'
```
</TabItem>
</Tabs>
LiteLLM will validate the response against the schema, and raise a `JSONSchemaValidationError` if the response does not match the schema.
JSONSchemaValidationError inherits from `openai.APIError`
Access the raw response with `e.raw_response`
### GenerationConfig Params
To pass additional GenerationConfig params - e.g. `topK`, just pass it in the request body of the call, and LiteLLM will pass it straight through as a key-value pair in the request body.
[**See Gemini GenerationConfigParams**](https://ai.google.dev/api/generate-content#v1beta.GenerationConfig)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    topK=1 # 👈 KEY CHANGE
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"topK": 1 # 👈 KEY CHANGE
}
'
```
</TabItem>
</Tabs>
## Specifying Safety Settings ## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
@ -91,6 +424,72 @@ assert isinstance(
``` ```
## JSON Mode
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object"} # 👈 KEY CHANGE
)
print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object"}
}
'
```
</TabItem>
</Tabs>
# Gemini-Pro-Vision # Gemini-Pro-Vision
LiteLLM Supports the following image types passed in `url` LiteLLM Supports the following image types passed in `url`
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg - Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
@ -141,8 +540,13 @@ print(content)
``` ```
## Chat Models ## Chat Models
:::tip
**We support ALL Gemini models, just set `model=gemini/<any-model-on-gemini>` as a prefix when sending litellm requests**
:::
| Model Name | Function Call | Required OS Variables | | Model Name | Function Call | Required OS Variables |
|-----------------------|--------------------------------------------------------|--------------------------------| |-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` | | gemini-pro | `completion(model='gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion('gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` | | gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` | | gemini-pro-vision | `completion(model='gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -0,0 +1,261 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🆕 Github
https://github.com/marketplace/models
:::tip
**We support ALL Github models, just set `model=github/<any-model-on-github>` as a prefix when sending litellm requests**
:::
## API Key
```python
# env variable
os.environ['GITHUB_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Usage with LiteLLM Proxy
### 1. Set Github Models on config.yaml
```yaml
model_list:
- model_name: github-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: github/llama3-8b-8192
api_key: "os.environ/GITHUB_API_KEY" # ensure you have `GITHUB_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make request to litellm proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "github-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="github-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "github-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Github Models Supported!
We support ALL Github models, just set `github/` as a prefix when sending completion requests
| Model Name | Usage |
|--------------------|---------------------------------------------------------|
| llama-3.1-8b-instant | `completion(model="github/llama-3.1-8b-instant", messages)` |
| llama-3.1-70b-versatile | `completion(model="github/llama-3.1-70b-versatile", messages)` |
| llama-3.1-405b-reasoning | `completion(model="github/llama-3.1-405b-reasoning", messages)` |
| llama3-8b-8192 | `completion(model="github/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="github/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="github/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="github/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="github/gemma-7b-it", messages)` |
## Github - Tool / Function Calling Example
```python
# Example dummy function hard coded to return the current weather
import json
import litellm
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "system",
"content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
},
{
"role": "user",
"content": "What's the weather like in San Francisco?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model="github/llama3-8b-8192",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
# Step 2: check if the model wanted to call a function
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
}
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="github/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Groq # Groq
https://groq.com/ https://groq.com/
@ -20,7 +23,7 @@ import os
os.environ['GROQ_API_KEY'] = "" os.environ['GROQ_API_KEY'] = ""
response = completion( response = completion(
model="groq/llama2-70b-4096", model="groq/llama3-8b-8192",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -35,7 +38,7 @@ import os
os.environ['GROQ_API_KEY'] = "" os.environ['GROQ_API_KEY'] = ""
response = completion( response = completion(
model="groq/llama2-70b-4096", model="groq/llama3-8b-8192",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -47,11 +50,109 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
### 1. Set Groq Models on config.yaml
```yaml
model_list:
- model_name: groq-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: groq/llama3-8b-8192
api_key: "os.environ/GROQ_API_KEY" # ensure you have `GROQ_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make request to litellm proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "groq-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="groq-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "groq-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Groq Models Supported! ## Supported Models - ALL Groq Models Supported!
We support ALL Groq models, just set `groq/` as a prefix when sending completion requests We support ALL Groq models, just set `groq/` as a prefix when sending completion requests
| Model Name | Function Call | | Model Name | Usage |
|--------------------|---------------------------------------------------------| |--------------------|---------------------------------------------------------|
| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` |
| llama-3.1-70b-versatile | `completion(model="groq/llama-3.1-70b-versatile", messages)` |
| llama-3.1-405b-reasoning | `completion(model="groq/llama-3.1-405b-reasoning", messages)` |
| llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` | | llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` | | llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
@ -114,7 +215,7 @@ tools = [
} }
] ]
response = litellm.completion( response = litellm.completion(
model="groq/llama2-70b-4096", model="groq/llama3-8b-8192",
messages=messages, messages=messages,
tools=tools, tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit tool_choice="auto", # auto is default, but we'll be explicit
@ -154,7 +255,7 @@ if tool_calls:
) # extend conversation with function response ) # extend conversation with function response
print(f"messages: {messages}") print(f"messages: {messages}")
second_response = litellm.completion( second_response = litellm.completion(
model="groq/llama2-70b-4096", messages=messages model="groq/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response ) # get a new response from the model where it can see the function response
print("second response\n", second_response) print("second response\n", second_response)
``` ```

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Mistral AI API # Mistral AI API
https://docs.mistral.ai/api/ https://docs.mistral.ai/api/
@ -41,18 +44,120 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
### 1. Set Mistral Models on config.yaml
```yaml
model_list:
- model_name: mistral-small-latest
litellm_params:
model: mistral/mistral-small-latest
api_key: "os.environ/MISTRAL_API_KEY" # ensure you have `MISTRAL_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "mistral-small-latest",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="mistral-small-latest", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "mistral-small-latest",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models

:::info

All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).

:::
| Model Name | Function Call |
|----------------------|----------------------------------------------------------------|
| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)` |
| Mistral Large 2 | `completion(model="mistral/mistral-large-2407", messages)` |
| Mistral Large Latest | `completion(model="mistral/mistral-large-latest", messages)` |
| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |
| Codestral | `completion(model="mistral/codestral-latest", messages)` |
| Mistral NeMo | `completion(model="mistral/open-mistral-nemo", messages)` |
| Mistral NeMo 2407 | `completion(model="mistral/open-mistral-nemo-2407", messages)` |
| Codestral Mamba | `completion(model="mistral/open-codestral-mamba", messages)` |
| Codestral Mamba | `completion(model="mistral/codestral-mamba-latest", messages)` |
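Any of these can also be called directly with the SDK - a minimal sketch (the key value is a placeholder):

```python
import os
from litellm import completion

os.environ["MISTRAL_API_KEY"] = ""  # placeholder - your Mistral API key

response = completion(
    model="mistral/mistral-small-latest",
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response)
```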
## Function Calling


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Ollama

LiteLLM supports all models from [Ollama](https://github.com/ollama/ollama)
@ -84,6 +87,120 @@ response = completion(
) )
``` ```
## Example Usage - Tool Calling
To use ollama tool calling, pass `tools=[{..}]` to `litellm.completion()`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import litellm
## [OPTIONAL] REGISTER MODEL - not all ollama models support function calling, litellm defaults to json mode tool calls if native tool calling not supported.
# litellm.register_model(model_cost={
# "ollama_chat/llama3.1": {
# "supports_function_calling": true
# },
# })
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="ollama_chat/llama3.1",
messages=messages,
tools=tools
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama_chat/llama3.1"
model_info:
supports_function_calling: true
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "llama3.1",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto",
"stream": true
}'
```
</TabItem>
</Tabs>
## Using ollama `api/chat`

In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`
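For example, a minimal sketch (the `api_base` value assumes a local Ollama server on its default port):

```python
from litellm import completion

response = completion(
    model="ollama_chat/llama3.1",       # `ollama_chat/` routes to POST /api/chat
    messages=[{"role": "user", "content": "hello from litellm"}],
    api_base="http://localhost:11434",  # assumed local Ollama server
)
print(response)
```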


@ -163,7 +163,10 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |
| gpt-4o-mini-2024-07-18 | `response = completion(model="gpt-4o-mini-2024-07-18", messages=messages)` |
| gpt-4o | `response = completion(model="gpt-4o", messages=messages)` |
| gpt-4o-2024-08-06 | `response = completion(model="gpt-4o-2024-08-06", messages=messages)` |
| gpt-4o-2024-05-13 | `response = completion(model="gpt-4o-2024-05-13", messages=messages)` |
| gpt-4-turbo | `response = completion(model="gpt-4-turbo", messages=messages)` |
| gpt-4-turbo-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
@ -236,6 +239,104 @@ response = completion(
## Advanced
### Getting OpenAI API Response Headers
Set `litellm.return_response_headers = True` to get raw response headers from OpenAI
You can expect to always get the `_response_headers` field from `litellm.completion()`, `litellm.embedding()` functions
<Tabs>
<TabItem value="litellm.completion" label="litellm.completion">
```python
litellm.return_response_headers = True
# /chat/completion
response = completion(
model="gpt-4o-mini",
messages=[
{
"role": "user",
"content": "hi",
}
],
)
print(f"response: {response}")
print("_response_headers=", response._response_headers)
```
</TabItem>
<TabItem value="litellm.completion - streaming" label="litellm.completion + stream">
```python
litellm.return_response_headers = True
# /chat/completion
response = completion(
model="gpt-4o-mini",
stream=True,
messages=[
{
"role": "user",
"content": "hi",
}
],
)
print(f"response: {response}")
print("response_headers=", response._response_headers)
for chunk in response:
print(chunk)
```
</TabItem>
<TabItem value="litellm.embedding" label="litellm.embedding">
```python
litellm.return_response_headers = True
# embedding
embedding_response = litellm.embedding(
model="text-embedding-ada-002",
input="hello",
)
embedding_response_headers = embedding_response._response_headers
print("embedding_response_headers=", embedding_response_headers)
```
</TabItem>
</Tabs>
Expected Response Headers from OpenAI
```json
{
"date": "Sat, 20 Jul 2024 22:05:23 GMT",
"content-type": "application/json",
"transfer-encoding": "chunked",
"connection": "keep-alive",
"access-control-allow-origin": "*",
"openai-model": "text-embedding-ada-002",
"openai-organization": "*****",
"openai-processing-ms": "20",
"openai-version": "2020-10-01",
"strict-transport-security": "max-age=15552000; includeSubDomains; preload",
"x-ratelimit-limit-requests": "5000",
"x-ratelimit-limit-tokens": "5000000",
"x-ratelimit-remaining-requests": "4999",
"x-ratelimit-remaining-tokens": "4999999",
"x-ratelimit-reset-requests": "12ms",
"x-ratelimit-reset-tokens": "0s",
"x-request-id": "req_cc37487bfd336358231a17034bcfb4d9",
"cf-cache-status": "DYNAMIC",
"set-cookie": "__cf_bm=E_FJY8fdAIMBzBE2RZI2.OkMIO3lf8Hz.ydBQJ9m3q8-1721513123-1.0.1.1-6OK0zXvtd5s9Jgqfz66cU9gzQYpcuh_RLaUZ9dOgxR9Qeq4oJlu.04C09hOTCFn7Hg.k.2tiKLOX24szUE2shw; path=/; expires=Sat, 20-Jul-24 22:35:23 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, *cfuvid=SDndIImxiO3U0aBcVtoy1TBQqYeQtVDo1L6*Nlpp7EU-1721513123215-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
"x-content-type-options": "nosniff",
"server": "cloudflare",
"cf-ray": "8a66409b4f8acee9-SJC",
"content-encoding": "br",
"alt-svc": "h3=\":443\"; ma=86400"
}
```
### Parallel Function calling

See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)

```python


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Perplexity AI (pplx-api)

https://www.perplexity.ai
@ -38,7 +41,7 @@ for chunk in response:
## Supported Models

All models listed here https://docs.perplexity.ai/docs/model-cards are supported. Just do `model=perplexity/<model-name>`.
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@ -60,3 +63,72 @@ All models listed here https://docs.perplexity.ai/docs/model-cards are supported
## Return citations
Perplexity supports returning citations via `return_citations=True`. [Perplexity Docs](https://docs.perplexity.ai/reference/post_chat_completions). Note: Perplexity has this feature in **closed beta**, so you need them to grant you access to get citations from their API.
If perplexity returns citations, LiteLLM will pass it straight through.
:::info
For passing more provider-specific params, [go here](../completion/provider_specific_params.md)
:::
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ['PERPLEXITYAI_API_KEY'] = ""

messages = [{"role": "user", "content": "Who won the world cup in 2022?"}]

response = completion(
model="perplexity/mistral-7b-instruct",
messages=messages,
return_citations=True
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add perplexity to config.yaml
```yaml
model_list:
- model_name: "perplexity-model"
litellm_params:
model: "llama-3.1-sonar-small-128k-online"
api_key: os.environ/PERPLEXITY_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "perplexity-model",
"messages": [
{
"role": "user",
"content": "Who won the world cup in 2022?"
}
],
"return_citations": true
}'
```
[**Call w/ OpenAI SDK, Langchain, Instructor, etc.**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>


@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
## 🆕 `vertex_ai_beta/` route

New `vertex_ai_beta/` route. Adds support for system messages, tool_choice params, etc. by moving to httpx client (instead of vertex sdk). This implementation uses [VertexAI's REST API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#syntax).
```python
from litellm import completion
@ -334,6 +334,10 @@ completion(model="vertex_ai_beta/gemini-1.5-flash-preview-0514", messages=messag
Add Google Search Result grounding to vertex ai calls.
[**Relevant VertexAI Docs**](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/grounding#examples)
See the grounding metadata with `response_obj._hidden_params["vertex_ai_grounding_metadata"]`
<Tabs>
<TabItem value="sdk" label="SDK">
@ -357,15 +361,17 @@ print(resp)
<TabItem value="proxy" label="PROXY"> <TabItem value="proxy" label="PROXY">
```bash ```bash
curl http://0.0.0.0:4000/v1/chat/completions \ curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-H "Authorization: Bearer $OPENAI_API_KEY" \ -H "Authorization: Bearer sk-1234" \
-d '{ -d '{
"model": "gpt-4o", "model": "gemini-pro",
"messages": [{"role": "user", "content": "Who won the world cup?"}], "messages": [
"tools": [ {"role": "user", "content": "Hello, Claude!"}
],
"tools": [
{ {
"googleSearchResults": {} "googleSearchRetrieval": {}
} }
] ]
}' }'
@ -375,6 +381,161 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>
#### **Moving from Vertex AI SDK to LiteLLM (GROUNDING)**
If this was your initial VertexAI Grounding code,
```python
import vertexai
from vertexai.generative_models import GenerationConfig, GenerativeModel, Tool, grounding

vertexai.init(project=project_id, location="us-central1")

model = GenerativeModel("gemini-1.5-flash-001")
# Use Google Search for grounding
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attribution=False))
prompt = "When is the next total solar eclipse in US?"
response = model.generate_content(
prompt,
tools=[tool],
generation_config=GenerationConfig(
temperature=0.0,
),
)
print(response)
```
then, this is what it looks like now
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
tools = [{"googleSearchRetrieval": {"disable_attributon": False}}] # 👈 ADD GOOGLE SEARCH
resp = litellm.completion(
model="vertex_ai_beta/gemini-1.0-pro-001",
messages=[{"role": "user", "content": "Who won the world cup?"}],
tools=tools,
vertex_project="project-id"
)
print(resp)
```
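The grounding metadata mentioned above can then be read off the response object - a small sketch, reusing `resp` from the snippet above:

```python
# key name taken from the note earlier in this section
print(resp._hidden_params["vertex_ai_grounding_metadata"])
```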
### **Context Caching**
Use Vertex AI Context Caching
[**Relevant VertexAI Docs**](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview)
<Tabs>
<TabItem value="proxy" label="LiteLLM PROXY">
1. Add model to config.yaml
```yaml
model_list:
# used for /chat/completions, /completions, /embeddings endpoints
- model_name: gemini-1.5-pro-001
litellm_params:
model: vertex_ai_beta/gemini-1.5-pro-001
vertex_project: "project-id"
vertex_location: "us-central1"
vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
# used for the /cachedContent and vertexAI native endpoints
default_vertex_config:
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
We make the request in two steps:
- Create a cachedContents object
- Use the cachedContents object in your /chat/completions
**Create a cachedContents object**
First, create a cachedContents object by calling the Vertex `cachedContents` endpoint. The LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API.
```python
import httpx
# Set Litellm proxy variables
LITELLM_BASE_URL = "http://0.0.0.0:4000"
LITELLM_PROXY_API_KEY = "sk-1234"
httpx_client = httpx.Client(timeout=30)
print("Creating cached content")
create_cache = httpx_client.post(
url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
json={
"model": "gemini-1.5-pro-001",
"contents": [
{
"role": "user",
"parts": [{
"text": "This is sample text to demonstrate explicit caching." * 4000
}]
}
],
}
)
print("Response from create_cache:", create_cache)
create_cache_response = create_cache.json()
print("JSON from create_cache:", create_cache_response)
cached_content_name = create_cache_response["name"]
```
**Use the cachedContents object in your /chat/completions request to VertexAI**
```python
import openai
# Set Litellm proxy variables
LITELLM_BASE_URL = "http://0.0.0.0:4000"
LITELLM_PROXY_API_KEY = "sk-1234"
client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
response = client.chat.completions.create(
model="gemini-1.5-pro-001",
max_tokens=8192,
messages=[
{
"role": "user",
"content": "What is the sample text about?",
},
],
temperature=0.7,
extra_body={"cached_content": cached_content_name}, # Use the cached content
)
print("Response from proxy:", response)
```
</TabItem>
</Tabs>
## Pre-requisites
* `pip install google-cloud-aiplatform` (pre-installed on proxy docker image)
* Authentication:
@ -697,6 +858,256 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem> </TabItem>
</Tabs> </Tabs>
## Llama 3 API
| Model Name | Function Call |
|------------------|--------------------------------------|
| meta/llama3-405b-instruct-maas | `completion('vertex_ai/meta/llama3-405b-instruct-maas', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "meta/llama3-405b-instruct-maas"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "anthropic-llama", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
</Tabs>
## Mistral API
[**Supported OpenAI Params**](https://github.com/BerriAI/litellm/blob/e0f3cd580cb85066f7d36241a03c30aa50a8a31d/litellm/llms/openai.py#L137)
| Model Name | Function Call |
|------------------|--------------------------------------|
| mistral-large@latest | `completion('vertex_ai/mistral-large@latest', messages)` |
| mistral-large@2407 | `completion('vertex_ai/mistral-large@2407', messages)` |
| mistral-nemo@latest | `completion('vertex_ai/mistral-nemo@latest', messages)` |
| codestral@latest | `completion('vertex_ai/codestral@latest', messages)` |
| codestral@2405 | `completion('vertex_ai/codestral@2405', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "mistral-large@2407"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "vertex-mistral", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
</Tabs>
### Usage - Codestral FIM
Call Codestral on VertexAI via the OpenAI [`/v1/completions`](https://platform.openai.com/docs/api-reference/completions/create) endpoint for FIM tasks.

Note: You can also call Codestral via `/chat/completions`.
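For example, a minimal chat-style call (a sketch; the project/location values are placeholders and Vertex credentials are assumed to be configured):

```python
from litellm import completion

response = completion(
    model="vertex_ai/codestral@2405",
    messages=[{"role": "user", "content": "write a python function to check if a number is odd"}],
    vertex_ai_project="your-vertex-project",    # placeholder
    vertex_ai_location="your-vertex-location",  # placeholder
)
print(response)
```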
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import text_completion
import os
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
# OR run `!gcloud auth print-access-token` in your terminal
model = "codestral@2405"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = text_completion(
model="vertex_ai/" + model,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
max_tokens=10, # optional
min_tokens=10, # optional
seed=10, # optional
stop=["return"], # optional
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl -X POST 'http://0.0.0.0:4000/completions' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"model": "vertex-codestral", # 👈 the 'model_name' in config
"prompt": "def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
"suffix":"return True", # optional
"temperature":0, # optional
"top_p":1, # optional
"max_tokens":10, # optional
"min_tokens":10, # optional
"seed":10, # optional
"stop":["return"], # optional
}'
```
</TabItem>
</Tabs>
## Model Garden

| Model Name | Function Call |
|------------------|--------------------------------------|


@ -119,13 +119,14 @@ All Possible Alert Types
```python
AlertType = Literal[
    "llm_exceptions",       # LLM API Exceptions
    "llm_too_slow",         # LLM Responses slower than alerting_threshold
    "llm_requests_hanging",
    "budget_alerts",
    "db_exceptions",
    "daily_reports",
    "spend_reports",
    "fallback_reports",
    "cooldown_deployment",
    "new_model_added",
    "outage_alerts",
@ -133,6 +134,61 @@ AlertType = Literal[
```
## Advanced - set specific slack channels per alert type
Use this if you want to set specific channels per alert type
**This allows you to do the following**
```
llm_exceptions -> go to slack channel #llm-exceptions
spend_reports -> go to slack channel #llm-spend-reports
```
Set `alert_to_webhook_url` on your config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
alerting: ["slack"]
  alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
alert_to_webhook_url: {
"llm_exceptions": "https://hooks.slack.com/services/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",
"llm_too_slow": "https://hooks.slack.com/services/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",
"llm_requests_hanging": "https://hooks.slack.com/services/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",
"budget_alerts": "https://hooks.slack.com/services/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",
"db_exceptions": "https://hooks.slack.com/services/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",
"daily_reports": "https://hooks.slack.com/services/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",
"spend_reports": "https://hooks.slack.com/services/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",
"cooldown_deployment": "https://hooks.slack.com/services/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",
"new_model_added": "https://hooks.slack.com/services/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",
"outage_alerts": "https://hooks.slack.com/services/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",
}
litellm_settings:
success_callback: ["langfuse"]
```
Test it - send a valid llm request - expect to see a `llm_too_slow` alert in its own slack channel
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
## Advanced - Using MS Teams Webhooks


@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Billing

Bill internal teams, external customers for their usage


@ -0,0 +1,191 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Logging GCS, s3 Buckets
LiteLLM Supports Logging to the following Cloud Buckets
- (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets)
- (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
## Logging Proxy Input/Output to Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
  callbacks: ["gcs_bucket"] # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
### Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
### Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
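After downloading the key, point LiteLLM at it via the env vars from step 2 above (a sketch; both values are placeholders):

```shell
export GCS_BUCKET_NAME="my-gcs-bucket"                           # placeholder
export GCS_PATH_SERVICE_ACCOUNT="/path/to/service_account.json"  # placeholder
```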
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to your s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket

View file

@ -59,6 +59,8 @@ litellm_settings:
  cache_params: # set cache params for redis
    type: redis
    ttl: 600 # will be cached on redis for 600s
# default_in_memory_ttl: Optional[float], default is None. time in seconds.
# default_in_redis_ttl: Optional[float], default is None. time in seconds.
``` ```
@ -258,6 +260,21 @@ curl --location 'http://0.0.0.0:4000/cache/ping' -H "Authorization: Bearer sk-1
```

## Advanced
### Control Call Types Caching is on for (`/chat/completions`, `/embeddings`, etc.)
By default, caching is on for all call types. You can control which call types caching is on for by setting `supported_call_types` in `cache_params`
**Cache will only be on for the call types specified in `supported_call_types`**
```yaml
litellm_settings:
cache: True
cache_params:
type: redis
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### Set Cache Params on config.yaml

```yaml
model_list:
@ -278,7 +295,8 @@ litellm_settings:
password: "your_password" # The password for the Redis cache. Required if type is "redis". password: "your_password" # The password for the Redis cache. Required if type is "redis".
# Optional configurations # Optional configurations
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
``` ```
### Turn on / off caching per request.
@ -294,6 +312,11 @@ The proxy support 4 cache-controls:
**Turn off caching**
Set `no-cache=True`, this will not return a cached response
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python ```python
import os import os
from openai import OpenAI from openai import OpenAI
@ -319,9 +342,81 @@ chat_completion = client.chat.completions.create(
} }
) )
``` ```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"no-cache": True},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Turn on caching**
By default cache is always on
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
import os
from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo"
)
```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Set `ttl`**
Set `ttl=600`, this will cache responses for 10 minutes (600 seconds)
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python ```python
import os import os
from openai import OpenAI from openai import OpenAI
@ -347,6 +442,35 @@ chat_completion = client.chat.completions.create(
} }
) )
``` ```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"ttl": 600},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Set `s-maxage`**
Set `s-maxage`, this will only return responses cached within the last 10 minutes
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python ```python
import os import os
@ -373,6 +497,27 @@ chat_completion = client.chat.completions.create(
} }
) )
``` ```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"s-maxage": 600},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
### Turn on / off caching per Key.
@ -486,21 +631,25 @@ litellm_settings:
```yaml
cache_params:
  # ttl
  ttl: Optional[float]
  default_in_memory_ttl: Optional[float]
  default_in_redis_ttl: Optional[float]

  # Type of cache (options: "local", "redis", "s3")
  type: s3

  # List of litellm call types to cache for
  # Options: "completion", "acompletion", "embedding", "aembedding"
  supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
  # /chat/completions, /completions, /embeddings, /audio/transcriptions

  # Redis cache parameters
  host: localhost # Redis server hostname or IP address
  port: "6379" # Redis server port (as a string)
  password: secret_password # Redis server password
  namespace: Optional[str] = None,

  # S3 cache parameters
  s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket


@ -47,6 +47,7 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
    async def async_post_call_success_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
        response,
    ):


@ -55,11 +55,19 @@ model_list:
  - model_name: vllm-models
    litellm_params:
      model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
      api_base: http://0.0.0.0:4000/v1
      api_key: none
      rpm: 1440
    model_info:
      version: 2
# Use this if you want to make requests to `claude-3-haiku-20240307`,`claude-3-opus-20240229`,`claude-2.1` without defining them on the config.yaml
# Default models
# Works for ALL Providers and needs the default provider credentials in .env
- model_name: "*"
litellm_params:
model: "*"
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
@ -277,52 +285,58 @@ curl --location 'http://0.0.0.0:4000/v1/model/info' \
--data '' --data ''
``` ```
## Provider specific wildcard routing

**Proxy all models from a provider**

Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml**

**Step 1** - define provider specific routing on config.yaml

```yaml
model_list:
  # provider specific wildcard routing
  - model_name: "anthropic/*"
    litellm_params:
      model: "anthropic/*"
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: "groq/*"
    litellm_params:
      model: "groq/*"
      api_key: os.environ/GROQ_API_KEY
```

Step 2 - Run litellm proxy

```shell
$ litellm --config /path/to/config.yaml
```

Step 3 Test it

Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`

```shell
curl http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "anthropic/claude-3-sonnet-20240229",
    "messages": [
      {"role": "user", "content": "Hello, Claude!"}
    ]
  }'
```

Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`

```shell
curl http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "groq/llama3-8b-8192",
    "messages": [
      {"role": "user", "content": "Hello, Claude!"}
    ]
  }'
```

## Load Balancing


@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';

# Custom LLM Pricing - Sagemaker, Azure, etc

Use this to register custom pricing for models.
@ -16,39 +16,9 @@ LiteLLM already has pricing for any model in our [model cost map](https://github
::: :::
## Cost Per Second (e.g. Sagemaker)

For cost per second pricing, you **just** need to register `input_cost_per_second`.

### Usage with LiteLLM Proxy Server
```python
# !pip install boto3
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
**Step 1: Add pricing to config.yaml**

```yaml
@ -75,38 +45,7 @@ litellm /path/to/config.yaml
## Cost Per Token (e.g. Azure)
### Usage with LiteLLM Proxy Server
```python
# !pip install boto3
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml ```yaml
model_list: model_list:


@ -231,7 +231,7 @@ curl -X POST 'http://localhost:4000/customer/new' \
```python
from openai import OpenAI
client = OpenAI(
    base_url="<your_proxy_base_url>",
    api_key="<your_proxy_key>"
)


@ -35,6 +35,22 @@ $ litellm --detailed_debug
os.environ["LITELLM_LOG"] = "DEBUG" os.environ["LITELLM_LOG"] = "DEBUG"
``` ```
### Debug Logs
Run the proxy with `--detailed_debug` to view detailed debug logs
```shell
litellm --config /path/to/config.yaml --detailed_debug
```
When making requests, you should see the POST request sent by LiteLLM to the LLM in the terminal output
```shell
POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Bearer sk-qnWGUIW9****************************************' \
-d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "this is a test request, write a short poem"}]}'
```
## JSON LOGS

Set `JSON_LOGS="True"` in your env:


@ -17,8 +17,15 @@ git clone https://github.com/BerriAI/litellm
# Go to folder
cd litellm

# Add the master key - you can change this after setup
echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
# Add the litellm salt key - you cannot change this after adding a model
# It is used to encrypt / decrypt your LLM API Key credentials
# We recommend - https://1password.com/password-generator/
# password generator to get a random hash for litellm salt key
echo 'LITELLM_SALT_KEY="sk-1234"' >> .env

source .env
# Start # Start
@ -239,7 +246,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```

Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
</TabItem> </TabItem>
@ -247,6 +254,15 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
**That's it! That's the quick start to deploy litellm**
## Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl
:::info
💡 Go here 👉 [to make your first LLM API Request](user_keys)
LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, Mistral SDK, LlamaIndex, Langchain (Js, Python)
:::
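For example, a minimal request with the OpenAI Python SDK pointed at the proxy (a sketch; the key and base URL are placeholders for your deployment):

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",               # placeholder - your LiteLLM proxy key
    base_url="http://0.0.0.0:4000",  # placeholder - your LiteLLM proxy URL
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)
print(response)
```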
## Options to deploy LiteLLM

| Docs | When to Use |
@ -285,7 +301,7 @@ docker run \
    --config /app/config.yaml --detailed_debug
```

Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.

</TabItem>
<TabItem value="kubernetes-deploy" label="Kubernetes">
@ -383,7 +399,7 @@ kubectl apply -f /path/to/service.yaml
kubectl port-forward service/litellm-service 4000:4000
```

Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.

</TabItem>
@ -425,7 +441,7 @@ kubectl \
  4000:4000
```

Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.

If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/values.yaml)
@ -470,7 +486,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```

Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.

</TabItem>
</Tabs>
@ -542,6 +558,39 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
``` ```
## LiteLLM without Internet Connection
By default `prisma generate` downloads [prisma's engine binaries](https://www.prisma.io/docs/orm/reference/environment-variables-reference#custom-engine-file-locations). This might cause errors when running without an internet connection.
Use this dockerfile to build an image which pre-generates the prisma binaries.
```Dockerfile
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
### [👇 KEY STEP] ###
# Install Prisma CLI and generate Prisma client
RUN pip install prisma
RUN prisma generate
### FIN ####
# Expose the necessary port
EXPOSE 4000
# Override the CMD instruction with your desired command and arguments
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
# CMD ["--port", "4000", "--config", "config.yaml"]
# Define the command to run your app
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
```
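Build and run it like any other LiteLLM image (a sketch; the image tag and config path are placeholders):

```shell
docker build -t litellm-offline .   # placeholder tag
docker run -p 4000:4000 \
  -v $(pwd)/config.yaml:/app/config.yaml \
  litellm-offline --config /app/config.yaml
```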
## Advanced Deployment Settings

### 1. Customization of the server root path (custom Proxy base url)
@ -556,24 +605,87 @@ In a Kubernetes deployment, it's possible to utilize a shared DNS to host multip
Customize the root path to eliminate the need for employing multiple DNS configurations during deployment.
Step 1.
👉 Set `SERVER_ROOT_PATH` in your .env and this will be set as your server root path
```
export SERVER_ROOT_PATH="/api/v1"
```
**Step 2** (If you want the Proxy Admin UI to work with your root path you need to use this dockerfile)
- Use the dockerfile below (it uses litellm as a base image)
- 👉 Set `UI_BASE_PATH=$SERVER_ROOT_PATH/ui` in the Dockerfile, example `UI_BASE_PATH=/api/v1/ui`
Dockerfile
```shell
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest

# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
# 👇👇 Enter your UI_BASE_PATH here
ENV UI_BASE_PATH="/api/v1/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
# only use --detailed_debug for debugging
CMD ["--port", "4000", "--config", "config.yaml"]
```
**Step 3** build this Dockerfile
```shell
docker build -f Dockerfile -t litellm-prod-build . --progress=plain
```
**Step 4. Run Proxy with `SERVER_ROOT_PATH` set in your env **
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-p 4000:4000 \
-e LITELLM_LOG="DEBUG"\
-e SERVER_ROOT_PATH="/api/v1"\
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e LITELLM_MASTER_KEY="sk-1234"\
litellm-prod-build \
--config /app/config.yaml
``` ```
After running the proxy you can access it on `http://0.0.0.0:4000/api/v1/` (since we set `SERVER_ROOT_PATH="/api/v1"`)

**Step 5. Verify Running on correct path**

<Image img={require('../../img/custom_root_path.png')} />
@ -593,6 +705,29 @@ docker run ghcr.io/berriai/litellm:main-latest \
Provide an ssl certificate when starting litellm proxy server
### 3. Providing LiteLLM config.yaml file as a s3 Object/url
Use this if you cannot mount a config file on your deployment service (example - AWS Fargate, Railway etc)
LiteLLM Proxy will read your config.yaml from an s3 Bucket
Set the following .env vars
```shell
LITELLM_CONFIG_BUCKET_NAME = "litellm-proxy" # your bucket name on s3
LITELLM_CONFIG_BUCKET_OBJECT_KEY = "litellm_proxy_config.yaml" # object key on s3
```
Start litellm proxy with these env vars - litellm will read your config from s3
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=<database_url> \
-e LITELLM_CONFIG_BUCKET_NAME=<bucket_name> \
    -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="<object_key>" \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest
```
## Platform-specific Guide

<Tabs>
@ -778,3 +913,31 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
Your LiteLLM container should be running now on the defined port e.g. `4000`.
### IAM-based Auth for RDS DB
1. Set AWS env var
```bash
export AWS_WEB_IDENTITY_TOKEN='/path/to/token'
export AWS_ROLE_NAME='arn:aws:iam::123456789012:role/MyRole'
export AWS_SESSION_NAME='MySession'
```
[**See all Auth options**](https://github.com/BerriAI/litellm/blob/089a4f279ad61b7b3e213d8039fb9b75204a7abc/litellm/proxy/auth/rds_iam_token.py#L165)
2. Add RDS credentials to env
```bash
export DATABASE_USER="db-user"
export DATABASE_PORT="5432"
export DATABASE_HOST="database-1-instance-1.cs1ksmwz2xt3.us-west-2.rds.amazonaws.com"
export DATABASE_NAME="database-1-instance-1"
```
3. Run proxy with iam+rds
```bash
litellm --config /path/to/config.yaml --iam_token_db_auth
```


@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';

# Email Notifications

Send an Email to your users when:
- A Proxy API Key is created for them


@ -21,15 +21,22 @@ Features:
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Set Max Request Size / File Size on Requests](#set-max-request--response-size-on-litellm-proxy)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Prometheus Metrics**
- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](prometheus)
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation**
- ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation)
- ✅ [Prompt Injection Detection (with LakeraAI API)](#prompt-injection-detection---lakeraai)
- ✅ [Prompt Injection Detection (with Aporia API)](#prompt-injection-detection---aporia-ai)
- ✅ [Switch LakeraAI on / off per request](guardrails#control-guardrails-onoff-per-request)
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
@ -113,7 +120,7 @@ client = openai.OpenAI(
base_url="http://0.0.0.0:4000" base_url="http://0.0.0.0:4000"
) )
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create( response = client.chat.completions.create(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages = [ messages = [
@ -124,7 +131,7 @@ response = client.chat.completions.create(
], ],
extra_body={ extra_body={
"metadata": { "metadata": {
"tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"] "tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"] # 👈 Key Change
} }
} }
) )
@ -133,6 +140,43 @@ print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="openai js" label="OpenAI JS">
```js
const openai = require('openai');
async function runOpenAI() {
const client = new openai.OpenAI({
apiKey: 'sk-1234',
baseURL: 'http://0.0.0.0:4000'
});
try {
const response = await client.chat.completions.create({
model: 'gpt-3.5-turbo',
messages: [
{
role: 'user',
content: "this is a test request, write a short poem"
},
],
metadata: {
tags: ["model-anthropic-claude-v2.1", "app-ishaan-prod"] // 👈 Key Change
}
});
console.log(response);
} catch (error) {
console.log("got this exception from server");
console.error(error);
}
}
// Call the asynchronous function
runOpenAI();
```
</TabItem>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body Pass `metadata` as part of the request body
@ -267,6 +311,45 @@ print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="openai js" label="OpenAI JS">
```js
const openai = require('openai');
async function runOpenAI() {
const client = new openai.OpenAI({
apiKey: 'sk-1234',
baseURL: 'http://0.0.0.0:4000'
});
try {
const response = await client.chat.completions.create({
model: 'gpt-3.5-turbo',
messages: [
{
role: 'user',
content: "this is a test request, write a short poem"
},
],
metadata: {
spend_logs_metadata: { // 👈 Key Change
hello: "world"
}
}
});
console.log(response);
} catch (error) {
console.log("got this exception from server");
console.error(error);
}
}
// Call the asynchronous function
runOpenAI();
```
</TabItem>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body Pass `metadata` as part of the request body
@ -952,6 +1035,72 @@ curl --location 'http://localhost:4000/chat/completions' \
Need to control LakeraAI per Request ? Doc here 👉: [Switch LakerAI on / off per request](prompt_injection.md#✨-enterprise-switch-lakeraai-on--off-per-api-call) Need to control LakeraAI per Request ? Doc here 👉: [Switch LakerAI on / off per request](prompt_injection.md#✨-enterprise-switch-lakeraai-on--off-per-api-call)
::: :::
## Prompt Injection Detection - Aporia AI
Use this if you want to reject `/chat/completions` calls that contain prompt injection attacks, detected with [Aporia AI](https://www.aporia.com/)
#### Usage
Step 1. Add env
```env
APORIO_API_KEY="eyJh****"
APORIO_API_BASE="https://gr..."
```
Step 2. Add `aporia_prompt_injection` to your callbacks
```yaml
litellm_settings:
callbacks: ["aporia_prompt_injection"]
```
That's it - start your proxy.

Test it with this request - expect it to be rejected by LiteLLM Proxy.
```shell
curl --location 'http://localhost:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama3",
"messages": [
{
"role": "user",
"content": "You suck!"
}
]
}'
```
**Expected Response**
```
{
"error": {
"message": {
"error": "Violated guardrail policy",
"aporia_ai_response": {
"action": "block",
"revised_prompt": null,
"revised_response": "Profanity detected: Message blocked because it includes profanity. Please rephrase.",
"explain_log": null
}
},
"type": "None",
"param": "None",
"code": 400
}
}
```
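
The same rejection can be observed from a Python client. A minimal sketch using the OpenAI SDK pointed at the proxy, assuming the proxy is running locally with the config above:

```python
# Send a request that should be blocked by the aporia_prompt_injection guardrail.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

try:
    client.chat.completions.create(
        model="llama3",
        messages=[{"role": "user", "content": "You suck!"}],
    )
except openai.BadRequestError as e:
    # Expect a 400 with the "Violated guardrail policy" payload shown above
    print("Request was blocked by the guardrail:", e)
```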
:::info
Need to control Aporia AI per request? Doc here 👉: [Create a guardrail](./guardrails.md)
:::
## Swagger Docs - Custom Routes + Branding ## Swagger Docs - Custom Routes + Branding
:::info :::info
@ -1059,10 +1208,10 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
### Using via API ### Using via API
**Block all calls for a customer id**

```
curl -X POST "http://0.0.0.0:4000/customer/block" \
-H "Authorization: Bearer sk-1234" \
-d '{
"user_ids": [<user_id>, ...]
@ -1079,6 +1228,8 @@ curl -X POST "http://0.0.0.0:4000/user/unblock" \
}' }'
``` ```
## Enable Banned Keywords List ## Enable Banned Keywords List
```yaml ```yaml
@ -1142,3 +1293,52 @@ How it works?
**Note:** Setting an environment variable within a Python script using os.environ will not make that variable accessible via SSH sessions or any other new processes that are started independently of the Python script. Environment variables set this way only affect the current process and its child processes. **Note:** Setting an environment variable within a Python script using os.environ will not make that variable accessible via SSH sessions or any other new processes that are started independently of the Python script. Environment variables set this way only affect the current process and its child processes.
## Set Max Request / Response Size on LiteLLM Proxy
Use this if you want to set a maximum request / response size for your proxy server. If a request exceeds the size limit, it is rejected and a Slack alert is triggered.
#### Usage
**Step 1.** Set `max_request_size_mb` and `max_response_size_mb`
For this example we set a very low limit on `max_request_size_mb` and expect it to get rejected
:::info
In production we recommend setting a `max_request_size_mb` / `max_response_size_mb` around `32 MB`
:::
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
# Security controls
max_request_size_mb: 0.000000001 # 👈 Key Change - Max Request Size in MB. Set this very low for testing
max_response_size_mb: 100 # 👈 Key Change - Max Response Size in MB
```
**Step 2.** Test it with `/chat/completions` request
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
**Expected Response from request**
We expect this to fail since the request size is over `max_request_size_mb`
```shell
{"error":{"message":"Request size is too large. Request size is 0.0001125335693359375 MB. Max size is 1e-09 MB","type":"bad_request_error","param":"content-length","code":400}}
```
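
To reason about what counts against the limit, the proxy compares the request body size (in MB) to `max_request_size_mb`. A small illustrative calculation (not LiteLLM's internal code):

```python
# Rough illustration of the size check: payload bytes -> MB vs. max_request_size_mb.
import json

payload = {
    "model": "fake-openai-endpoint",
    "messages": [{"role": "user", "content": "Hello, Claude!"}],
}

body = json.dumps(payload).encode("utf-8")
request_size_mb = len(body) / (1024 * 1024)
max_request_size_mb = 0.000000001  # the deliberately tiny limit from the config above

print(f"Request size: {request_size_mb} MB, limit: {max_request_size_mb} MB")
print("Rejected" if request_size_mb > max_request_size_mb else "Accepted")
```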
@ -1,18 +1,10 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# 🛡️ [Beta] Guardrails
Setup Prompt Injection Detection, Secret Detection on LiteLLM Proxy Setup Prompt Injection Detection, Secret Detection on LiteLLM Proxy
:::info
✨ Enterprise Only Feature
Schedule a meeting with us to get an Enterprise License 👉 Talk to founders [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Quick Start ## Quick Start
### 1. Setup guardrails on litellm proxy config.yaml ### 1. Setup guardrails on litellm proxy config.yaml
@ -217,12 +209,12 @@ If you need to switch `pii_masking` off for an API Key set `"permissions": {"pii
<TabItem value="/key/generate" label="/key/generate"> <TabItem value="/key/generate" label="/key/generate">
```shell ```shell
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
    "permissions": {"pii_masking": true}
}'
``` ```
```shell ```shell
@ -266,6 +258,54 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
## Disable team from turning on/off guardrails
### 1. Disable team from modifying guardrails
```bash
curl -X POST 'http://0.0.0.0:4000/team/update' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"team_id": "4198d93c-d375-4c83-8d5a-71e7c5473e50",
"metadata": {"guardrails": {"modify_guardrails": false}}
}'
```
### 2. Try to disable guardrails for a call
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Think of 10 random colors."
}
],
"metadata": {"guardrails": {"hide_secrets": false}}
}'
```
### 3. Get 403 Error
```
{
"error": {
"message": {
"error": "Your team does not have permission to modify guardrails."
},
"type": "auth_error",
"param": "None",
"code": 403
}
}
```
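
From a Python client, the same attempt surfaces as a 403. A sketch using the OpenAI SDK, assuming a virtual key (placeholder value below) that belongs to the restricted team:

```python
# Try to switch a guardrail off via metadata; expect a 403 since the team is restricted.
import openai

client = openai.OpenAI(
    api_key="sk-litellm-virtual-key",   # placeholder: a key belonging to the restricted team
    base_url="http://0.0.0.0:4000",
)

try:
    client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Think of 10 random colors."}],
        extra_body={"metadata": {"guardrails": {"hide_secrets": False}}},
    )
except openai.PermissionDeniedError as e:
    print("Team is not allowed to modify guardrails:", e)
```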
Expect to NOT see `+1 412-612-9992` in your server logs on your callback. Expect to NOT see `+1 412-612-9992` in your server logs on your callback.
:::info :::info
@ -277,28 +317,39 @@ The `pii_masking` guardrail ran on this request because api key=sk-jNm1Zar7XfNdZ
## Spec for `guardrails` on litellm config ## Spec for `guardrails` on litellm config
```yaml
litellm_settings:
guardrails:
- string: GuardrailItemSpec
```
- `string` - Your custom guardrail name
- `GuardrailItemSpec`:
- `callbacks`: List[str], list of supported guardrail callbacks.
- Full List: presidio, lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation
- `default_on`: bool, will run on all llm requests when true
- `logging_only`: Optional[bool], if true, run guardrail only on logged output, not on the actual LLM API call. Currently only supported for presidio pii masking. Requires `default_on` to be True as well.
- `callback_args`: Optional[Dict[str, Dict]]: If set, pass in init args for that specific guardrail
Example:
```yaml ```yaml
litellm_settings: litellm_settings:
guardrails: guardrails:
- prompt_injection: # your custom name for guardrail - prompt_injection: # your custom name for guardrail
callbacks: [lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation] # litellm callbacks to use callbacks: [lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation] # litellm callbacks to use
default_on: true # will run on all llm requests when true default_on: true # will run on all llm requests when true
callback_args: {"lakera_prompt_injection": {"moderation_check": "pre_call"}}
- hide_secrets: - hide_secrets:
callbacks: [hide_secrets] callbacks: [hide_secrets]
default_on: true default_on: true
- pii_masking:
      callbacks: ["presidio"]
default_on: true
logging_only: true
- your-custom-guardrail - your-custom-guardrail
callbacks: [hide_secrets] callbacks: [hide_secrets]
default_on: false default_on: false
``` ```
### `guardrails`: List of guardrail configurations to be applied to LLM requests.
#### Guardrail: `prompt_injection`: Configuration for detecting and preventing prompt injection attacks.
- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation)
- `default_on`: Boolean flag determining if this guardrail runs on all LLM requests by default.
#### Guardrail: `your-custom-guardrail`: Configuration for a user-defined custom guardrail.
- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`
- `default_on`: Boolean flag determining if this custom guardrail runs by default, set to false.
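
If it helps to see the spec as code, here is a rough Pydantic model of the `GuardrailItemSpec` fields described above - an illustration of the shape of each entry, not LiteLLM's internal class:

```python
# Illustrative model of one guardrail entry in litellm_settings.guardrails.
from typing import Dict, List, Optional
from pydantic import BaseModel

class GuardrailItemSpec(BaseModel):
    callbacks: List[str]                             # e.g. ["lakera_prompt_injection", "hide_secrets"]
    default_on: bool = False                         # run on every LLM request when True
    logging_only: Optional[bool] = None              # presidio only: mask logged output, not the LLM call
    callback_args: Optional[Dict[str, dict]] = None  # per-callback init args

# Example: the `prompt_injection` guardrail from the config above
prompt_injection = GuardrailItemSpec(
    callbacks=["lakera_prompt_injection", "hide_secrets"],
    default_on=True,
    callback_args={"lakera_prompt_injection": {"moderation_check": "pre_call"}},
)
print(prompt_injection.model_dump())
```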
@ -41,28 +41,6 @@ litellm --health
} }
``` ```
### Background Health Checks
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
Here's how to use it:
1. in the config.yaml add:
```
general_settings:
background_health_checks: True # enable background health checks
health_check_interval: 300 # frequency of background health checks
```
2. Start server
```
$ litellm /path/to/config.yaml
```
3. Query health endpoint:
```
curl --location 'http://0.0.0.0:4000/health'
```
### Embedding Models ### Embedding Models
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
@ -112,6 +90,66 @@ model_list:
mode: completion # 👈 ADD THIS mode: completion # 👈 ADD THIS
``` ```
### Speech to Text Models
```yaml
model_list:
- model_name: whisper
litellm_params:
model: whisper-1
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
```
### Text to Speech Models
```yaml
# OpenAI Text to Speech Models
- model_name: tts
litellm_params:
model: openai/tts-1
api_key: "os.environ/OPENAI_API_KEY"
model_info:
mode: audio_speech
```
## Background Health Checks
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
Here's how to use it:
1. in the config.yaml add:
```
general_settings:
background_health_checks: True # enable background health checks
health_check_interval: 300 # frequency of background health checks
```
2. Start server
```
$ litellm /path/to/config.yaml
```
3. Query health endpoint:
```
curl --location 'http://0.0.0.0:4000/health'
```
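
With background health checks enabled, `/health` returns the cached results. A small sketch that polls the endpoint and prints unhealthy deployments - it assumes the proxy runs locally with master key `sk-1234`, and the exact response fields may vary by version:

```python
# Query the cached /health results and print any unhealthy deployments.
import requests

resp = requests.get(
    "http://0.0.0.0:4000/health",
    headers={"Authorization": "Bearer sk-1234"},
    timeout=30,
)
resp.raise_for_status()
report = resp.json()

print("healthy:", report.get("healthy_count"), "unhealthy:", report.get("unhealthy_count"))
for endpoint in report.get("unhealthy_endpoints", []):
    print("unhealthy deployment:", endpoint)
```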
### Hide details
The health check response contains details like endpoint URLs, error messages,
and other LiteLLM params. While this is useful for debugging, it can be
problematic when exposing the proxy server to a broad audience.
You can hide these details by setting the `health_check_details` setting to `False`.
```yaml
general_settings:
health_check_details: False
```
## `/health/readiness` ## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests Unprotected endpoint for checking if proxy is ready to accept requests
@ -1,28 +1,68 @@
# Logging
Log Proxy input, output, and exceptions using:
- Langfuse
- OpenTelemetry
- Custom Callbacks
- Langsmith
- DataDog
- DynamoDB
- etc.
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
## Getting the LiteLLM Call ID
LiteLLM generates a unique `call_id` for each request. This `call_id` can be used to track the request across the system. This can be very useful for finding the info for a particular request in a logging system like one of the systems mentioned in this page.

```shell
curl -i -sSL --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "what llm are you"}]
}' | grep 'x-litellm'
```

The output of this is:

```output
x-litellm-call-id: b980db26-9512-45cc-b1da-c511a363b83f
x-litellm-model-id: cb41bc03f4c33d310019bae8c5afdb1af0a8f97b36a234405a9807614988457c
x-litellm-model-api-base: https://x-example-1234.openai.azure.com
x-litellm-version: 1.40.21
x-litellm-response-cost: 2.85e-05
x-litellm-key-tpm-limit: None
x-litellm-key-rpm-limit: None
```

A number of these headers could be useful for troubleshooting, but the `x-litellm-call-id` is the one that is most useful for tracking a request across components in your system, including in logging tools.
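
You can also read these headers from Python by asking the OpenAI SDK for the raw response. A minimal sketch, assuming the proxy is running locally:

```python
# Capture the x-litellm-call-id header for a request made through the proxy.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
)

print("x-litellm-call-id:", raw.headers.get("x-litellm-call-id"))
completion = raw.parse()  # the usual ChatCompletion object
print(completion.choices[0].message.content)
```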
## Redacting UserAPIKeyInfo
Redact information about the user api key (hashed token, user_id, team id, etc.), from logs.
Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
```yaml
litellm_settings:
callbacks: ["langfuse"]
redact_user_api_key_info: true
```
Removes any field with `user_api_key_*` from metadata.
## Logging Proxy Input/Output - Langfuse ## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` - this will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
**Step 1** Install langfuse **Step 1** Install langfuse
@ -32,6 +72,7 @@ pip install langfuse>=2.0.0
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -42,6 +83,7 @@ litellm_settings:
``` ```
**Step 3**: Set required env variables for logging to langfuse **Step 3**: Set required env variables for logging to langfuse
```shell ```shell
export LANGFUSE_PUBLIC_KEY="pk_kk" export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss" export LANGFUSE_SECRET_KEY="sk_ss"
@ -52,11 +94,13 @@ export LANGFUSE_HOST="https://xxx.langfuse.com"
**Step 4**: Start the proxy, make a test request **Step 4**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
litellm --test litellm --test
``` ```
@ -67,7 +111,6 @@ Expected output on Langfuse
### Logging Metadata to Langfuse ### Logging Metadata to Langfuse
<Tabs> <Tabs>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
@ -93,6 +136,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
} }
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+"> <TabItem value="openai" label="OpenAI v1.0.0+">
@ -126,6 +170,7 @@ response = client.chat.completions.create(
print(response) print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="langchain" label="Langchain"> <TabItem value="langchain" label="Langchain">
@ -168,9 +213,11 @@ print(response)
</TabItem> </TabItem>
</Tabs> </Tabs>
### Team based Logging to Langfuse ### Team based Logging to Langfuse
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging)
<!--
**Example:** **Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id This config would send langfuse logs to 2 different langfuse projects, based on the team id
@ -197,7 +244,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
-d '{"team_id": "ishaans-secret-project"}' -d '{"team_id": "ishaans-secret-project"}'
``` ```
All requests made with these keys will log data to their team-specific logging. All requests made with these keys will log data to their team-specific logging. -->
### Redacting Messages, Response Content from Langfuse Logging ### Redacting Messages, Response Content from Langfuse Logging
@ -231,6 +278,42 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
### LiteLLM-specific Tags on Langfuse - `cache_hit`, `cache_key`
Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default, the LiteLLM Proxy logs no LiteLLM-specific fields.
| LiteLLM specific field | Description | Example Value |
|------------------------|-------------------------------------------------------|------------------------------------------------|
| `cache_hit` | Indicates whether a cache hit occurred (True) or not (False) | `true`, `false` |
| `cache_key` | The Cache key used for this request | `d2b758c****`|
| `proxy_base_url` | The base URL for the proxy server, the value of env var `PROXY_BASE_URL` on your server | `https://proxy.example.com`|
| `user_api_key_alias` | An alias for the LiteLLM Virtual Key.| `prod-app1` |
| `user_api_key_user_id` | The unique ID associated with a user's API key. | `user_123`, `user_456` |
| `user_api_key_user_email` | The email associated with a user's API key. | `user@example.com`, `admin@example.com` |
| `user_api_key_team_alias` | An alias for a team associated with an API key. | `team_alpha`, `dev_team` |
**Usage**
Specify `langfuse_default_tags` to control what litellm fields get logged on Langfuse
Example config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
success_callback: ["langfuse"]
# 👇 Key Change
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"]
```
### 🔧 Debugging - Viewing RAW CURL sent from LiteLLM to provider ### 🔧 Debugging - Viewing RAW CURL sent from LiteLLM to provider
Use this when you want to view the RAW curl request sent from LiteLLM to the LLM API Use this when you want to view the RAW curl request sent from LiteLLM to the LLM API
@ -257,6 +340,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
} }
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+"> <TabItem value="openai" label="OpenAI v1.0.0+">
@ -287,6 +371,7 @@ response = client.chat.completions.create(
print(response) print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="langchain" label="Langchain"> <TabItem value="langchain" label="Langchain">
@ -332,7 +417,6 @@ You will see `raw_request` in your Langfuse Metadata. This is the RAW CURL comma
<Image img={require('../../img/debug_langfuse.png')} /> <Image img={require('../../img/debug_langfuse.png')} />
## Logging Proxy Input/Output in OpenTelemetry format ## Logging Proxy Input/Output in OpenTelemetry format
:::info :::info
@ -348,10 +432,8 @@ OTEL_SERVICE_NAME=<your-service-name>` # default="litellm"
<Tabs> <Tabs>
<TabItem value="Console Exporter" label="Log to console"> <TabItem value="Console Exporter" label="Log to console">
**Step 1:** Set callbacks and env vars **Step 1:** Set callbacks and env vars
Add the following to your env Add the following to your env
@ -367,7 +449,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -427,7 +508,6 @@ This is the Span from OTEL Logging
</TabItem> </TabItem>
<TabItem value="Honeycomb" label="Log to Honeycomb"> <TabItem value="Honeycomb" label="Log to Honeycomb">
#### Quick Start - Log to Honeycomb #### Quick Start - Log to Honeycomb
@ -449,7 +529,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -474,10 +553,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="otel-col" label="Log to OTEL HTTP Collector"> <TabItem value="otel-col" label="Log to OTEL HTTP Collector">
#### Quick Start - Log to OTEL Collector #### Quick Start - Log to OTEL Collector
@ -499,7 +576,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -526,7 +602,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem> </TabItem>
<TabItem value="otel-col-grpc" label="Log to OTEL GRPC Collector"> <TabItem value="otel-col-grpc" label="Log to OTEL GRPC Collector">
#### Quick Start - Log to OTEL GRPC Collector #### Quick Start - Log to OTEL GRPC Collector
@ -548,7 +623,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -573,7 +647,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="traceloop" label="Log to Traceloop Cloud"> <TabItem value="traceloop" label="Log to Traceloop Cloud">
@ -596,7 +669,6 @@ environment_variables:
TRACELOOP_API_KEY: "XXXXX" TRACELOOP_API_KEY: "XXXXX"
``` ```
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
@ -632,11 +704,15 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
❓ Use this when you want to **pass information about the incoming request in a distributed tracing system** ❓ Use this when you want to **pass information about the incoming request in a distributed tracing system**
✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header) ✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header)
```curl ```curl
traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01 traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01
``` ```
Example Usage Example Usage
1. Make Request to LiteLLM Proxy with `traceparent` header 1. Make Request to LiteLLM Proxy with `traceparent` header
```python ```python
import openai import openai
import uuid import uuid
@ -660,7 +736,6 @@ response = client.chat.completions.create(
) )
print(response) print(response)
``` ```
```shell ```shell
@ -674,12 +749,29 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} /> <Image img={require('../../img/otel_parent.png')} />
### Forwarding `Traceparent HTTP Header` to LLM APIs
Use this if you want to forward the traceparent headers to your self hosted LLMs like vLLM
Set `forward_traceparent_to_llm_provider: True` in your `config.yaml`. This will forward the `traceparent` header to your LLM API
:::warning
Only use this for self hosted LLMs, this can cause Bedrock, VertexAI calls to fail
:::
```yaml
litellm_settings:
forward_traceparent_to_llm_provider: True
```
## Custom Callback Class [Async] ## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python` Use this when you want to run custom callbacks in `python`
#### Step 1 - Create your custom `litellm` callback class #### Step 1 - Create your custom `litellm` callback class
We use `litellm.integrations.custom_logger` for this, **more details about litellm custom callbacks [here](https://docs.litellm.ai/docs/observability/custom_callback)** We use `litellm.integrations.custom_logger` for this, **more details about litellm custom callbacks [here](https://docs.litellm.ai/docs/observability/custom_callback)**
Define your custom callback class in a python file. Define your custom callback class in a python file.
@ -782,16 +874,17 @@ proxy_handler_instance = MyCustomHandler()
``` ```
#### Step 2 - Pass your custom callback class in `config.yaml` #### Step 2 - Pass your custom callback class in `config.yaml`
We pass the custom callback class defined in **Step1** to the config.yaml. We pass the custom callback class defined in **Step1** to the config.yaml.
Set `callbacks` to `python_filename.logger_instance_name` Set `callbacks` to `python_filename.logger_instance_name`
In the config below, we pass In the config below, we pass
- python_filename: `custom_callbacks.py` - python_filename: `custom_callbacks.py`
- logger_instance_name: `proxy_handler_instance`. This is defined in Step 1 - logger_instance_name: `proxy_handler_instance`. This is defined in Step 1
`callbacks: custom_callbacks.proxy_handler_instance` `callbacks: custom_callbacks.proxy_handler_instance`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -804,6 +897,7 @@ litellm_settings:
``` ```
#### Step 3 - Start proxy + test request #### Step 3 - Start proxy + test request
```shell ```shell
litellm --config proxy_config.yaml litellm --config proxy_config.yaml
``` ```
@ -825,6 +919,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
``` ```
#### Resulting Log on Proxy #### Resulting Log on Proxy
```shell ```shell
On Success On Success
Model: gpt-3.5-turbo, Model: gpt-3.5-turbo,
@ -877,7 +972,6 @@ class MyCustomHandler(CustomLogger):
"max_tokens": 10 "max_tokens": 10
} }
} }
``` ```
#### Logging `model_info` set in config.yaml #### Logging `model_info` set in config.yaml
@ -895,11 +989,13 @@ class MyCustomHandler(CustomLogger):
``` ```
**Expected Output** **Expected Output**
```json ```json
{'mode': 'embedding', 'input_cost_per_token': 0.002} {'mode': 'embedding', 'input_cost_per_token': 0.002}
``` ```
### Logging responses from proxy ### Logging responses from proxy
Both `/chat/completions` and `/embeddings` responses are available as `response_obj` Both `/chat/completions` and `/embeddings` responses are available as `response_obj`
**Note: for `/chat/completions`, both `stream=True` and `non stream` responses are available as `response_obj`** **Note: for `/chat/completions`, both `stream=True` and `non stream` responses are available as `response_obj`**
@ -913,6 +1009,7 @@ class MyCustomHandler(CustomLogger):
``` ```
**Expected Output /chat/completion [for both `stream` and `non-stream` responses]** **Expected Output /chat/completion [for both `stream` and `non-stream` responses]**
```json ```json
ModelResponse( ModelResponse(
id='chatcmpl-8Tfu8GoMElwOZuj2JlHBhNHG01PPo', id='chatcmpl-8Tfu8GoMElwOZuj2JlHBhNHG01PPo',
@ -939,6 +1036,7 @@ ModelResponse(
``` ```
**Expected Output /embeddings** **Expected Output /embeddings**
```json ```json
{ {
'model': 'ada', 'model': 'ada',
@ -958,7 +1056,6 @@ ModelResponse(
} }
``` ```
## Custom Callback APIs [Async] ## Custom Callback APIs [Async]
:::info :::info
@ -968,10 +1065,12 @@ This is an Enterprise only feature [Get Started with Enterprise here](https://gi
::: :::
Use this if you: Use this if you:
- Want to use custom callbacks written in a non Python programming language - Want to use custom callbacks written in a non Python programming language
- Want your callbacks to run on a different microservice - Want your callbacks to run on a different microservice
#### Step 1. Create your generic logging API endpoint #### Step 1. Create your generic logging API endpoint
Set up a generic API endpoint that can receive data in JSON format. The data will be included within a "data" field. Set up a generic API endpoint that can receive data in JSON format. The data will be included within a "data" field.
Your server should support the following Request format: Your server should support the following Request format:
@ -1034,11 +1133,8 @@ async def log_event(request: Request):
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn
uvicorn.run(app, host="127.0.0.1", port=4000) uvicorn.run(app, host="127.0.0.1", port=4000)
``` ```
#### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to #### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
```shell ```shell
@ -1048,6 +1144,7 @@ os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
#### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"] #### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
Example litellm proxy config.yaml Example litellm proxy config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1059,8 +1156,98 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging LLM IO to Langsmith
1. Set `success_callback: ["langsmith"]` on litellm config.yaml
If you're using a custom LangSmith instance, you can set the
`LANGSMITH_BASE_URL` environment variable to point to your instance.
```yaml
litellm_settings:
success_callback: ["langsmith"]
environment_variables:
LANGSMITH_API_KEY: "lsv2_pt_xxxxxxxx"
LANGSMITH_PROJECT: "litellm-proxy"
LANGSMITH_BASE_URL: "https://api.smith.langchain.com" # (Optional - only needed if you have a custom Langsmith instance)
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "Hello, Claude gm!"
}
],
}
'
```
Expect to see your log on Langsmith
<Image img={require('../../img/langsmith_new.png')} />
## Logging LLM IO to Arize AI
1. Set `success_callback: ["arize"]` on litellm config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "Hello, Claude gm!"
}
],
}
'
```
Expect to see your log on Arize AI
<Image img={require('../../img/langsmith_new.png')} />
## Logging LLM IO to Galileo ## Logging LLM IO to Galileo
[BETA] [BETA]
Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/) Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/)
@ -1083,6 +1270,7 @@ export GALILEO_PASSWORD=""
### Quick Start ### Quick Start
1. Add to Config.yaml 1. Add to Config.yaml
```yaml ```yaml
model_list: model_list:
- litellm_params: - litellm_params:
@ -1118,7 +1306,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
' '
``` ```
🎉 That's it - Expect to see your Logs on your Galileo Dashboard 🎉 That's it - Expect to see your Logs on your Galileo Dashboard
## Logging Proxy Cost + Usage - OpenMeter ## Logging Proxy Cost + Usage - OpenMeter
@ -1136,6 +1323,7 @@ export OPENMETER_API_KEY=""
### Quick Start ### Quick Start
1. Add to Config.yaml 1. Add to Config.yaml
```yaml ```yaml
model_list: model_list:
- litellm_params: - litellm_params:
@ -1171,13 +1359,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
' '
``` ```
<Image img={require('../../img/openmeter_img_2.png')} /> <Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog ## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` - this will log all successful LLM calls to DataDog
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1197,6 +1386,7 @@ DD_SITE="us5.datadoghq.com" # your datadog base url
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
@ -1224,66 +1414,10 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} /> <Image img={require('../../img/dd_small1.png')} />
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
      s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket
## Logging Proxy Input/Output - DynamoDB ## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set We will use the `--config` to set
- `litellm.success_callback = ["dynamodb"]` - `litellm.success_callback = ["dynamodb"]`
- `litellm.dynamodb_table_name = "your-table-name"` - `litellm.dynamodb_table_name = "your-table-name"`
@ -1298,6 +1432,7 @@ AWS_REGION_NAME = ""
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1311,11 +1446,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
```shell ```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \ curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
@ -1403,19 +1540,18 @@ Your logs should be available on DynamoDB
} }
``` ```
## Logging Proxy Input/Output - Sentry ## Logging Proxy Input/Output - Sentry
If api calls fail (llm/database) you can log those to Sentry: If api calls fail (llm/database) you can log those to Sentry:
**Step 1** Install Sentry **Step 1** Install Sentry
```shell ```shell
pip install --upgrade sentry-sdk pip install --upgrade sentry-sdk
``` ```
**Step 2**: Save your Sentry_DSN and add `litellm_settings`: `failure_callback` **Step 2**: Save your Sentry_DSN and add `litellm_settings`: `failure_callback`
```shell ```shell
export SENTRY_DSN="your-sentry-dsn" export SENTRY_DSN="your-sentry-dsn"
``` ```
@ -1435,11 +1571,13 @@ general_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
litellm --test litellm --test
``` ```
@ -1457,6 +1595,7 @@ ATHINA_API_KEY = "your-athina-api-key"
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1469,11 +1608,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
curl --location 'http://0.0.0.0:4000/chat/completions' \ curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
@ -1505,6 +1646,7 @@ AZURE_CONTENT_SAFETY_KEY = "<your-azure-content-safety-key>"
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1520,11 +1662,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
curl --location 'http://0.0.0.0:4000/chat/completions' \ curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
@ -1540,7 +1684,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
``` ```
An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`. An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`.
The details of the response will describe:
- The `source` : input text or llm generated text - The `source` : input text or llm generated text
- The `category` : the category of the content that triggered the moderation - The `category` : the category of the content that triggered the moderation
- The `severity` : the severity from 0 to 10 - The `severity` : the severity from 0 to 10
@ -17,7 +17,7 @@ model_list:
## Get Model Information - `/model/info` ## Get Model Information - `/model/info`
Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the model_info you set and the [litellm model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Sensitive details like API keys are excluded for security purposes.
<Tabs <Tabs
defaultValue="curl" defaultValue="curl"
@ -35,22 +35,33 @@ curl -X GET "http://0.0.0.0:4000/model/info" \
## Add a New Model ## Add a New Model
Add a new model to the proxy via the `/model/new` API, to add models without restarting the proxy.

<Tabs>
<TabItem value="API">

```bash
curl -X POST "http://0.0.0.0:4000/model/new" \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'
```

</TabItem>
<TabItem value="Yaml">
```yaml
model_list:
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ### `openai.chat.completions.create(model="gpt-3.5-turbo",...)`
litellm_params: # all params accepted by litellm.completion() - https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/types/router.py#L297
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
model_info:
my_custom_key: my_custom_value # additional model metadata
```
</TabItem>
</Tabs> </Tabs>
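
The same `/model/new` call from Python - a short sketch using `requests`, assuming your master key is `sk-1234` and with placeholder model parameters:

```python
# Add a model to the running proxy without a restart via /model/new.
import requests

resp = requests.post(
    "http://0.0.0.0:4000/model/new",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "model_name": "azure-gpt-turbo",
        "litellm_params": {
            "model": "azure/gpt-3.5-turbo",
            "api_key": "os.environ/AZURE_API_KEY",
            "api_base": "my-azure-api-base",
        },
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```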
@ -86,3 +97,82 @@ Keep in mind that as both endpoints are in [BETA], you may need to visit the ass
- Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964) - Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964)
Feedback on the beta endpoints is valuable and helps improve the API for all users. Feedback on the beta endpoints is valuable and helps improve the API for all users.
## Add Additional Model Information
If you want the ability to add a display name, description, and labels for models, just use `model_info:`
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
### Usage
1. Add additional information to model
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
2. Call with `/model/info`
Use a key with access to the model `gpt-4`.
```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer LITELLM_KEY' \
```
3. **Expected Response**
Returned `model_info = Your custom model_info + (if exists) LITELLM MODEL INFO`
[**How LiteLLM Model Info is found**](https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/proxy/proxy_server.py#L7460)
[Tell us how this can be improved!](https://github.com/BerriAI/litellm/issues)
```bash
{
"data": [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4"
},
"model_info": {
"id": "e889baacd17f591cce4c63639275ba5e8dc60765d6c553e6ee5a504b19e50ddc",
"db_model": false,
"my_custom_key": "my_custom_value", # 👈 CUSTOM INFO
"key": "gpt-4", # 👈 KEY in LiteLLM MODEL INFO/COST MAP - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
"max_tokens": 4096,
"max_input_tokens": 8192,
"max_output_tokens": 4096,
"input_cost_per_token": 3e-05,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"output_cost_per_token": 6e-05,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat"
}
},
]
}
```
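
To pull the custom metadata back out programmatically, you can call `/model/info` and read each model's `my_custom_key`. A sketch assuming a key (placeholder below) with access to `gpt-4`:

```python
# Read custom model_info fields back from the proxy.
import requests

resp = requests.get(
    "http://0.0.0.0:4000/v1/model/info",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder LiteLLM key
    timeout=30,
)
resp.raise_for_status()

for entry in resp.json()["data"]:
    info = entry.get("model_info", {})
    print(entry["model_name"], "->", info.get("my_custom_key"))
```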
@ -1,4 +1,4 @@
# Attribute Management changes to Users # Attribute Management changes to Users
Call management endpoints on behalf of a user. (Useful when connecting proxy to your development platform). Call management endpoints on behalf of a user. (Useful when connecting proxy to your development platform).
@ -0,0 +1,63 @@
# OAuth 2.0 Authentication

Use this if you want to use an OAuth 2.0 token to make `/chat`, `/embeddings` requests to the LiteLLM Proxy
:::info
This is an Enterprise Feature - [get in touch with us if you want a free trial to test if this feature meets your needs](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Usage
1. Set env vars:
```bash
export OAUTH_TOKEN_INFO_ENDPOINT="https://your-provider.com/token/info"
export OAUTH_USER_ID_FIELD_NAME="sub"
export OAUTH_USER_ROLE_FIELD_NAME="role"
export OAUTH_USER_TEAM_ID_FIELD_NAME="team_id"
```
- `OAUTH_TOKEN_INFO_ENDPOINT`: URL to validate OAuth tokens
- `OAUTH_USER_ID_FIELD_NAME`: Field in token info response containing user ID
- `OAUTH_USER_ROLE_FIELD_NAME`: Field in token info for user's role
- `OAUTH_USER_TEAM_ID_FIELD_NAME`: Field in token info for user's team ID
2. Enable on litellm config.yaml
Set this on your config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
enable_oauth2_auth: true
```
3. Use token in requests to LiteLLM
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
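
From Python, the OAuth token simply takes the place of a LiteLLM key. A minimal sketch with the OpenAI SDK - the token value is a placeholder for an access token issued by your provider:

```python
# Use an OAuth 2.0 access token as the bearer credential for the proxy.
import openai

client = openai.OpenAI(
    api_key="eyJhbGciOi...your-oauth2-access-token",  # placeholder OAuth 2.0 token
    base_url="http://0.0.0.0:4000",
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
)
print(response.choices[0].message.content)
```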
## Debugging
Start the LiteLLM Proxy with [`--detailed_debug` mode and you should see more verbose logs](cli.md#detailed_debug)
@ -35,6 +35,7 @@ general_settings:
Authorization: "bearer os.environ/COHERE_API_KEY" # (Optional) Auth Header to forward to your Endpoint Authorization: "bearer os.environ/COHERE_API_KEY" # (Optional) Auth Header to forward to your Endpoint
content-type: application/json # (Optional) Extra Headers to pass to this endpoint content-type: application/json # (Optional) Extra Headers to pass to this endpoint
accept: application/json accept: application/json
forward_headers: True # (Optional) Forward all headers from the incoming request to the target endpoint
``` ```
**Step 2** Start Proxy Server in detailed_debug mode **Step 2** Start Proxy Server in detailed_debug mode
@ -156,6 +157,8 @@ POST /api/public/ingestion HTTP/1.1" 207 Multi-Status
Use this if you want the pass through endpoint to honour LiteLLM keys/authentication Use this if you want the pass through endpoint to honour LiteLLM keys/authentication
This also enforces the key's rpm limits on pass-through endpoints.
Usage - set `auth: true` on the config Usage - set `auth: true` on the config
```yaml ```yaml
general_settings: general_settings:
@ -190,6 +193,53 @@ curl --request POST \
}' }'
``` ```
### Use Langfuse client sdk w/ LiteLLM Key
**Usage**
1. Set-up yaml to pass-through langfuse /api/public/ingestion
```yaml
general_settings:
master_key: sk-1234
pass_through_endpoints:
- path: "/api/public/ingestion" # route you want to add to LiteLLM Proxy Server
target: "https://us.cloud.langfuse.com/api/public/ingestion" # URL this route should forward
auth: true # 👈 KEY CHANGE
custom_auth_parser: "langfuse" # 👈 KEY CHANGE
headers:
LANGFUSE_PUBLIC_KEY: "os.environ/LANGFUSE_DEV_PUBLIC_KEY" # your langfuse account public key
LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY" # your langfuse account secret key
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test with langfuse sdk
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000", # your litellm proxy endpoint
public_key="sk-1234", # your litellm proxy api key
secret_key="anything", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
## `pass_through_endpoints` Spec on config.yaml ## `pass_through_endpoints` Spec on config.yaml
All possible values for `pass_through_endpoints` and what they mean All possible values for `pass_through_endpoints` and what they mean
@ -218,3 +268,149 @@ general_settings:
* `LANGFUSE_PUBLIC_KEY` *string*: Your Langfuse account public key - only set this when forwarding to Langfuse. * `LANGFUSE_PUBLIC_KEY` *string*: Your Langfuse account public key - only set this when forwarding to Langfuse.
* `LANGFUSE_SECRET_KEY` *string*: Your Langfuse account secret key - only set this when forwarding to Langfuse. * `LANGFUSE_SECRET_KEY` *string*: Your Langfuse account secret key - only set this when forwarding to Langfuse.
* `<your-custom-header>` *string*: Pass any custom header key/value pair * `<your-custom-header>` *string*: Pass any custom header key/value pair
* `forward_headers` *Optional(boolean)*: If true, all headers from the incoming request will be forwarded to the target endpoint. Default is `False`.
## Custom Chat Endpoints (Anthropic/Bedrock/Vertex)
Allow developers to call the proxy with Anthropic/boto3/etc. client SDKs.
Test our [Anthropic Adapter](../anthropic_completion.md) for reference [**Code**](https://github.com/BerriAI/litellm/blob/fd743aaefd23ae509d8ca64b0c232d25fe3e39ee/litellm/adapters/anthropic_adapter.py#L50)
### 1. Write an Adapter
Translate the request/response from your custom API schema to the OpenAI schema (used by litellm.completion()) and back.
For provider-specific params 👉 [**Provider-Specific Params**](../completion/provider_specific_params.md)
```python
from litellm import adapter_completion
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
import os
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import json
import os
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
from pydantic import BaseModel
###################
# CUSTOM ADAPTER ##
###################
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: litellm.ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
return super().translate_completion_output_params_streaming()
anthropic_adapter = AnthropicAdapter()
###########
# TEST IT #
###########
## register CUSTOM ADAPTER
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["COHERE_API_KEY"] = "your-cohere-key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = adapter_completion(model="gpt-3.5-turbo", messages=messages, adapter_id="anthropic")
# cohere call
response = adapter_completion(model="command-nightly", messages=messages, adapter_id="anthropic")
print(response)
```
### 2. Create new endpoint
We pass the custom adapter class defined in Step 1 to the config.yaml. Set `target` to `python_filename.logger_instance_name`.
In the config below, we pass
python_filename: `custom_callbacks.py`
logger_instance_name: `anthropic_adapter`. This is defined in Step 1
`target: custom_callbacks.anthropic_adapter`
```yaml
model_list:
- model_name: my-fake-claude-endpoint
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
general_settings:
master_key: sk-1234
pass_through_endpoints:
- path: "/v1/messages" # route you want to add to LiteLLM Proxy Server
target: custom_callbacks.anthropic_adapter # Adapter to use for this route
headers:
litellm_user_api_key: "x-api-key" # Field in headers, containing LiteLLM Key
```
### 3. Test it!
**Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**Curl**
```bash
curl --location 'http://0.0.0.0:4000/v1/messages' \
-H 'x-api-key: sk-1234' \
-H 'anthropic-version: 2023-06-01' \
-H 'content-type: application/json' \
-d '{
"model": "my-fake-claude-endpoint",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
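Since the adapter exposes an Anthropic-compatible `/v1/messages` route, you should also be able to call it with the Anthropic Python SDK - a minimal sketch, assuming the proxy runs on `localhost:4000` and `sk-1234` is a valid LiteLLM key:
```python
import anthropic

# point the Anthropic SDK at the LiteLLM proxy instead of api.anthropic.com
client = anthropic.Anthropic(
    base_url="http://localhost:4000",  # LiteLLM proxy
    api_key="sk-1234",                 # LiteLLM virtual key, sent as the x-api-key header
)

message = client.messages.create(
    model="my-fake-claude-endpoint",   # model_name from the config above
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello, world"}],
)
print(message.content)
```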

View file

@ -180,3 +180,59 @@ chat_completion = client.chat.completions.create(
"_response_ms": 1753.426 "_response_ms": 1753.426
} }
``` ```
## Turn on for logging only
Only apply PII masking before logging to Langfuse, etc. - not on the actual LLM API request / response.
:::note
This is currently only applied for
- `/chat/completion` requests
- on 'success' logging
:::
1. Setup config.yaml
```yaml
litellm_settings:
presidio_logging_only: true
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Hi, my name is Jane!"
}
]
}'
```
**Expected Logged Response**
```
Hi, my name is <PERSON>!
```
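The same test via the OpenAI Python SDK - a minimal sketch, assuming the proxy runs on `0.0.0.0:4000` with master key `sk-1234`. Note the client still receives the unmasked text; only the logged copy has PII replaced:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi, my name is Jane!"}],
)

# unmasked in the API response - the masked version only shows up in your logging tool
print(response.choices[0].message.content)
```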

View file

@ -84,6 +84,20 @@ Set `export LITELLM_MODE="PRODUCTION"`
This disables the load_dotenv() functionality, which will automatically load your environment credentials from the local `.env`. This disables the load_dotenv() functionality, which will automatically load your environment credentials from the local `.env`.
## 5. Set LiteLLM Salt Key
If you plan on using the DB, set a salt key for encrypting/decrypting variables in the DB.
Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials.
We recommend using a password generator such as https://1password.com/password-generator/ to get a random hash for your LiteLLM salt key.
```bash
export LITELLM_SALT_KEY="sk-1234"
```
[**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15)
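Any sufficiently random string works. If you prefer generating one from the terminal, one option is `openssl` - store the value somewhere safe, since it must stay the same across restarts:
```bash
# generate a random 64-character hex string to use as the salt key
export LITELLM_SALT_KEY="$(openssl rand -hex 32)"
```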
## Extras ## Extras
### Expected Performance in Production ### Expected Performance in Production

View file

@ -1,7 +1,16 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# 📈 Prometheus metrics [BETA] # 📈 [BETA] Prometheus metrics
:::info
🚨 Prometheus metrics will be out of Beta on September 15, 2024 - as part of this release, they will be part of LiteLLM Enterprise, starting at $250/mo
[Enterprise Pricing](https://www.litellm.ai/#pricing)
[Contact us here to get a free trial](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll
@ -47,9 +56,11 @@ http://localhost:4000/metrics
# <proxy_base_url>/metrics # <proxy_base_url>/metrics
``` ```
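A minimal Prometheus scrape config for this endpoint might look like the following sketch (assumes the proxy is reachable at `localhost:4000`):
```yaml
# prometheus.yml (sketch)
scrape_configs:
  - job_name: "litellm-proxy"
    metrics_path: /metrics
    static_configs:
      - targets: ["localhost:4000"] # <proxy_host>:<proxy_port>
```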
## Metrics Tracked ## 📈 Metrics Tracked
### Proxy Requests / Spend Metrics
| Metric Name | Description | | Metric Name | Description |
|----------------------|--------------------------------------| |----------------------|--------------------------------------|
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` | | `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` |
@ -57,6 +68,23 @@ http://localhost:4000/metrics
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` | | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` | | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
### LLM API / Provider Metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
| `litellm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `litellm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
### Budget Metrics ### Budget Metrics
| Metric Name | Description | | Metric Name | Description |
|----------------------|--------------------------------------| |----------------------|--------------------------------------|
@ -64,55 +92,6 @@ http://localhost:4000/metrics
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)| | `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens
Set this on your config.yaml to allow you to track how close you are to hitting your TPM / RPM limits on each model group
```yaml
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]
return_response_headers: true # ensures the LLM API calls track the response headers
```
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` return from LLM API Deployment |
Example Metric
<Tabs>
<TabItem value="Remaining Requests" label="Remaining Requests">
```shell
litellm_remaining_requests
{
api_base="https://api.openai.com/v1",
api_provider="openai",
litellm_model_name="gpt-3.5-turbo",
model_group="gpt-3.5-turbo"
}
8998.0
```
</TabItem>
<TabItem value="Requests" label="Remaining Tokens">
```shell
litellm_remaining_tokens
{
api_base="https://api.openai.com/v1",
api_provider="openai",
litellm_model_name="gpt-3.5-turbo",
model_group="gpt-3.5-turbo"
}
999981.0
```
</TabItem>
</Tabs>
## Monitor System Health ## Monitor System Health

View file

@ -13,20 +13,23 @@ LiteLLM Supports the following methods for detecting prompt injection attacks
Use this if you want to reject /chat, /completions, /embeddings calls that have prompt injection attacks Use this if you want to reject /chat, /completions, /embeddings calls that have prompt injection attacks
LiteLLM uses [LakerAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack LiteLLM uses [LakeraAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack
#### Usage ### Usage
Step 1 Set a `LAKERA_API_KEY` in your env Step 1 Set a `LAKERA_API_KEY` in your env
``` ```
LAKERA_API_KEY="7a91a1a6059da*******" LAKERA_API_KEY="7a91a1a6059da*******"
``` ```
Step 2. Add `lakera_prompt_injection` to your calbacks Step 2. Add `lakera_prompt_injection` as a guardrail
```yaml ```yaml
litellm_settings: litellm_settings:
callbacks: ["lakera_prompt_injection"] guardrails:
- prompt_injection: # your custom name for guardrail
callbacks: ["lakera_prompt_injection"] # litellm callbacks to use
default_on: true # will run on all llm requests when true
``` ```
That's it, start your proxy That's it, start your proxy
@ -48,6 +51,48 @@ curl --location 'http://localhost:4000/chat/completions' \
}' }'
``` ```
### Advanced - set category-based thresholds.
Lakera has 2 categories for prompt_injection attacks:
- jailbreak
- prompt_injection
```yaml
litellm_settings:
guardrails:
- prompt_injection: # your custom name for guardrail
callbacks: ["lakera_prompt_injection"] # litellm callbacks to use
default_on: true # will run on all llm requests when true
callback_args:
lakera_prompt_injection:
category_thresholds: {
"prompt_injection": 0.1,
"jailbreak": 0.1,
}
```
### Advanced - Run before/in-parallel to request.
Control if the Lakera prompt_injection check runs before a request or in parallel to it (both requests need to be completed before a response is returned to the user).
```yaml
litellm_settings:
guardrails:
- prompt_injection: # your custom name for guardrail
callbacks: ["lakera_prompt_injection"] # litellm callbacks to use
default_on: true # will run on all llm requests when true
callback_args:
lakera_prompt_injection: {"moderation_check": "in_parallel"} # options: "pre_call", "in_parallel"
```
### Advanced - set custom API Base.
```bash
export LAKERA_API_BASE=""
```
[**Learn More**](./guardrails.md)
## Similarity Checking ## Similarity Checking
LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack. LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack.

View file

@ -5,7 +5,7 @@ import TabItem from '@theme/TabItem';
# Quick Start # Quick Start
Quick start CLI, Config, Docker Quick start CLI, Config, Docker
LiteLLM Server manages: LiteLLM Server (LLM Gateway) manages:
* **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format * **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
* **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys) * **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
@ -243,7 +243,8 @@ model_list:
- model_name: vllm-model - model_name: vllm-model
litellm_params: litellm_params:
model: openai/<your-model-name> model: openai/<your-model-name>
api_base: <your-api-base> # e.g. http://0.0.0.0:3000 api_base: <your-vllm-api-base> # e.g. http://0.0.0.0:3000/v1
api_key: <your-vllm-api-key|none>
``` ```
### Run proxy with config ### Run proxy with config
@ -255,6 +256,12 @@ litellm --config your_config.yaml
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain ## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
:::info
LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, Mistral SDK, LLamaIndex, Langchain (Js, Python)
[More examples here](user_keys)
:::
<Tabs> <Tabs>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
@ -382,6 +389,34 @@ print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
</Tabs> </Tabs>
[**More Info**](./configs.md) [**More Info**](./configs.md)
@ -396,165 +431,6 @@ print(response)
- POST `/key/generate` - generate a key to access the proxy - POST `/key/generate` - generate a key to access the proxy
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```python
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Debugging Proxy ## Debugging Proxy
Events that occur during normal operation Events that occur during normal operation

View file

@ -31,15 +31,26 @@ model_list:
api_base: https://openai-france-1234.openai.azure.com/ api_base: https://openai-france-1234.openai.azure.com/
api_key: <your-azure-api-key> api_key: <your-azure-api-key>
rpm: 1440 rpm: 1440
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
num_retries: 2
timeout: 30 # 30 seconds
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
redis_password: <your redis password>
redis_port: 1992
``` ```
:::info
Detailed information about [routing strategies can be found here](../routing)
:::
#### Step 2: Start Proxy with config #### Step 2: Start Proxy with config
```shell ```shell
$ litellm --config /path/to/config.yaml $ litellm --config /path/to/config.yaml
``` ```
### Test - Load Balancing ### Test - Simple Call
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
@ -127,6 +138,27 @@ print(response)
</Tabs> </Tabs>
### Test - Loadbalancing
In this request, the following will occur:
1. A rate limit exception will be raised
2. LiteLLM proxy will retry the request on the model group (the default is 3 retries).
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Hi there!"}
],
"mock_testing_rate_limit_error": true
}'
```
[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
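The same test via the OpenAI Python SDK, passing the mock flag through `extra_body` - a sketch, assuming the proxy runs on `0.0.0.0:4000` with master key `sk-1234`:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# force a rate limit error on the first deployment; the proxy retries across the model group
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi there!"}],
    extra_body={"mock_testing_rate_limit_error": True},
)
print(response)
```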
### Test - Client Side Fallbacks ### Test - Client Side Fallbacks
In this request the following will occur: In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail 1. The request to `model="zephyr-beta"` will fail
@ -434,6 +466,33 @@ litellm_settings:
### Default Fallbacks
You can also set default_fallbacks, in case a specific model group is misconfigured / bad.
```yaml
model_list:
- model_name: gpt-3.5-turbo-small
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
- model_name: claude-opus
litellm_params:
model: claude-3-opus-20240229
api_key: os.environ/ANTHROPIC_API_KEY
litellm_settings:
default_fallbacks: ["claude-opus"]
```
This will default to claude-opus in case any model fails.
Model-specific fallbacks (e.g. {"gpt-3.5-turbo-small": ["claude-opus"]}) override the default fallbacks.
### Test Fallbacks! ### Test Fallbacks!
Check if your fallbacks are working as expected. Check if your fallbacks are working as expected.

View file

@ -4,7 +4,7 @@ import TabItem from '@theme/TabItem';
# 🤗 UI - Self-Serve # 🤗 UI - Self-Serve
Allow users to create their own keys on [Proxy UI](./ui.md). ## Allow users to create their own keys on [Proxy UI](./ui.md).
1. Add user with permissions to a team on proxy 1. Add user with permissions to a team on proxy
@ -125,6 +125,41 @@ LiteLLM Enterprise: Enable [SSO login](./ui.md#setup-ssoauth-for-ui)
<Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} /> <Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} />
## Allow users to View Usage, Caching Analytics
1. Go to Internal Users -> +Invite User
Set their role to `Admin Viewer` - this means they can only view usage, caching analytics
<Image img={require('../../img/ui_invite_user.png')} style={{ width: '800px', height: 'auto' }} />
<br />
2. Share invitation link with user
<Image img={require('../../img/ui_invite_link.png')} style={{ width: '800px', height: 'auto' }} />
<br />
3. User logs in via email + password auth
<Image img={require('../../img/ui_clean_login.png')} style={{ width: '500px', height: 'auto' }} />
<br />
4. User can now view Usage, Caching Analytics
<Image img={require('../../img/ui_usage.png')} style={{ width: '800px', height: 'auto' }} />
## Available Roles
Here are the available UI roles for a LiteLLM Internal User:
**Admin Roles:**
- `proxy_admin`: admin over the platform
- `proxy_admin_viewer`: can login, view all keys, view all spend. **Cannot** create/delete keys, add new users.
**Internal User Roles:**
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
## Advanced ## Advanced
### Setting custom logout URLs ### Setting custom logout URLs
@ -138,3 +173,24 @@ export PROXY_LOGOUT_URL="https://www.google.com"
<Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} /> <Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} />
### Set max budget for internal users
Automatically apply budget per internal user when they sign up
```yaml
litellm_settings:
max_internal_user_budget: 10
internal_user_budget_duration: "1mo" # reset every month
```
This sets a max budget of $10 USD for internal users when they sign up.
This budget only applies to personal keys created by that user - seen under `Default Team` on the UI.
<Image img={require('../../img/max_budget_for_internal_users.png')} style={{ width: '500px', height: 'auto' }} />
This budget does not apply to keys created under non-default teams.
### Set max budget for teams
[**Go Here**](./team_budgets.md)

View file

@ -8,6 +8,7 @@ Define your custom callback class in a python file.
```python ```python
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
import litellm import litellm
import logging
# This file includes the custom callbacks for LiteLLM Proxy # This file includes the custom callbacks for LiteLLM Proxy
# Once defined, these can be passed in proxy_config.yaml # Once defined, these can be passed in proxy_config.yaml
@ -25,9 +26,9 @@ class MyCustomHandler(CustomLogger):
datefmt='%Y-%m-%d %H:%M:%S' datefmt='%Y-%m-%d %H:%M:%S'
) )
response_cost = litellm.completion_cost(completion_response=completion_response) response_cost: Optional[float] = kwargs.get("response_cost", None)
print("regular response_cost", response_cost) print("regular response_cost", response_cost)
logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}") logging.info(f"Model {response_obj.model} Cost: ${response_cost:.8f}")
except: except:
pass pass

View file

@ -0,0 +1,133 @@
# Tag Based Routing
Route requests based on tags.
This is useful for implementing free / paid tiers for users
### 1. Define tags on config.yaml
- A request with `tags=["free"]` will get routed to `openai/fake`
- A request with `tags=["paid"]` will get routed to `openai/gpt-4o`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
tags: ["free"] # 👈 Key Change
- model_name: gpt-4
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
tags: ["paid"] # 👈 Key Change
router_settings:
enable_tag_filtering: True # 👈 Key Change
general_settings:
master_key: sk-1234
```
### 2. Make Request with `tags=["free"]`
This request includes "tags": ["free"], which routes it to `openai/fake`
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
],
"tags": ["free"]
}'
```
**Expected Response**
Expect to see the following response header when this works
```shell
x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/
```
Response
```shell
{
"id": "chatcmpl-33c534e3d70148218e2d62496b81270b",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "\n\nHello there, how may I assist you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1677652288,
"model": "gpt-3.5-turbo-0125",
"object": "chat.completion",
"system_fingerprint": "fp_44709d6fcb",
"usage": {
"completion_tokens": 12,
"prompt_tokens": 9,
"total_tokens": 21
}
}
```
### 3. Make Request with `tags=["paid"]`
This request includes "tags": ["paid"], which routes it to `openai/gpt-4o`
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
],
"tags": ["paid"]
}'
```
**Expected Response**
Expect to see the following response header when this works
```shell
x-litellm-model-api-base: https://api.openai.com
```
Response
```shell
{
"id": "chatcmpl-9maCcqQYTqdJrtvfakIawMOIUbEZx",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Good morning! How can I assist you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1721365934,
"model": "gpt-4o-2024-05-13",
"object": "chat.completion",
"system_fingerprint": "fp_c4e5b6fa31",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 12,
"total_tokens": 22
}
}
```
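If you call the proxy through the OpenAI Python SDK, `tags` is a LiteLLM-specific field, so pass it via `extra_body` - a sketch:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
    extra_body={"tags": ["paid"]},  # routed to the deployment tagged "paid"
)
print(response.model)
```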

View file

@ -1,4 +1,4 @@
# 👥 Team-based Routing + Logging # 👥 Team-based Routing
## Routing ## Routing
Route calls to different model groups based on the team-id Route calls to different model groups based on the team-id
@ -71,35 +71,3 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
}' }'
``` ```
## Logging / Caching
Turn on/off logging and caching for a specific team id.
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.

View file

@ -0,0 +1,227 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 👥📊 Team/Key Based Logging
Allow each key/team to use their own Langfuse Project / custom callbacks
**This allows you to do the following**
```
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
## Team Based Logging
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md)
## Logging / Caching
Turn on/off logging and caching for a specific team id.
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
## [BETA] Team Logging via API
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Set Callbacks Per Team
#### 1. Set callback for team
We make a request to `POST /team/{team_id}/callback` to add a success/failure callback for a team.
```shell
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {
"langfuse_public_key": "pk",
"langfuse_secret_key": "sk_",
"langfuse_host": "https://cloud.langfuse.com"
}
}'
```
##### Supported Values
| Field | Supported Values | Notes |
|-------|------------------|-------|
| `callback_name` | `"langfuse"` | Currently only supports "langfuse" |
| `callback_type` | `"success"`, `"failure"`, `"success_and_failure"` | |
| `callback_vars` | | dict of callback settings |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_public_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_secret_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_host` | string | Optional (defaults to https://cloud.langfuse.com) |
#### 2. Create key for team
All keys created for team `dbe2f686-a686-4896-864a-4c3924458709` will log to langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"team_id": "dbe2f686-a686-4896-864a-4c3924458709"
}'
```
#### 3. Make `/chat/completion` request for team
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
Expect this to be logged on the langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
### Disable Logging for a Team
To disable logging for a specific team, you can use the following endpoint:
`POST /team/{team_id}/disable_logging`
This endpoint removes all success and failure callbacks for the specified team, effectively disabling logging.
#### Step 1. Disable logging for team
```shell
curl -X POST 'http://localhost:4000/team/YOUR_TEAM_ID/disable_logging' \
-H 'Authorization: Bearer YOUR_API_KEY'
```
Replace YOUR_TEAM_ID with the actual team ID
**Response**
A successful request will return a response similar to this:
```json
{
"status": "success",
"message": "Logging disabled for team YOUR_TEAM_ID",
"data": {
"team_id": "YOUR_TEAM_ID",
"success_callbacks": [],
"failure_callbacks": []
}
}
```
#### Step 2. Test it - `/chat/completions`
Use a key generated for team = `team_id` - you should see no logs on your configured success callback (eg. Langfuse)
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
#### Debugging / Troubleshooting
- Check active callbacks for team using `GET /team/{team_id}/callback`
Use this to check what success/failure callbacks are active for team=`team_id`
```shell
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Authorization: Bearer sk-1234'
```
### Team Logging Endpoints
- [`POST /team/{team_id}/callback` Add a success/failure callback to a team](https://litellm-api.up.railway.app/#/team%20management/add_team_callbacks_team__team_id__callback_post)
- [`GET /team/{team_id}/callback` - Get the success/failure callbacks and variables for a team](https://litellm-api.up.railway.app/#/team%20management/get_team_callbacks_team__team_id__callback_get)
## [BETA] Key Based Logging
Use the `/key/generate` or `/key/update` endpoints to add logging callbacks to a specific key.
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"metadata": {
"logging": [{
"callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
"callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY", # [RECOMMENDED] reference key in proxy environment
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment
"langfuse_host": "https://cloud.langfuse.com"
}
}]
}
}'
```
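The same key generation from Python with `requests` - a sketch, assuming the proxy is on `0.0.0.0:4000` and `sk-1234` is the master key:
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "metadata": {
            "logging": [
                {
                    "callback_name": "langfuse",
                    "callback_type": "success",
                    "callback_vars": {
                        # [RECOMMENDED] reference keys in the proxy environment
                        "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
                        "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
                        "langfuse_host": "https://cloud.langfuse.com",
                    },
                }
            ]
        }
    },
)
print(resp.json()["key"])  # virtual key with its own logging callbacks
```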
---
Help us improve this feature, by filing a [ticket here](https://github.com/BerriAI/litellm/issues)

View file

@ -53,6 +53,12 @@ UI_PASSWORD=langchain # password to sign in on UI
On accessing the LiteLLM UI, you will be prompted to enter your username, password On accessing the LiteLLM UI, you will be prompted to enter your username, password
## Invite other users
Allow others to create/delete their own keys.
[**Go Here**](./self_serve.md)
## ✨ Enterprise Features ## ✨ Enterprise Features
Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise) Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)
@ -76,6 +82,13 @@ litellm_settings:
- Key will be created with `max_budget=100` since 100 is the upper bound - Key will be created with `max_budget=100` since 100 is the upper bound
#### Step 2: Setup Oauth Client #### Step 2: Setup Oauth Client
:::tip
Looking for how to use Oauth 2.0 for /chat, /completions API requests to the proxy? [Follow this doc](oauth2)
:::
<Tabs> <Tabs>
<TabItem value="okta" label="Okta SSO"> <TabItem value="okta" label="Okta SSO">
@ -186,6 +199,16 @@ PROXY_BASE_URL=https://litellm-api.up.railway.app/
#### Step 4. Test flow #### Step 4. Test flow
<Image img={require('../../img/litellm_ui_3.gif')} /> <Image img={require('../../img/litellm_ui_3.gif')} />
### Restrict Email Subdomains w/ SSO
If you're using SSO and want to only allow users with a specific email domain - e.g. `@berri.ai` accounts - to access the UI, do this:
```bash
export ALLOWED_EMAIL_DOMAINS="berri.ai"
```
This will check if the user email we receive from SSO contains this domain, before allowing access.
### Set Admin view w/ SSO ### Set Admin view w/ SSO
You just need to set Proxy Admin ID You just need to set Proxy Admin ID

View file

@ -1,7 +1,43 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl # 💡 Migrating from OpenAI (Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl)
LiteLLM Proxy is **OpenAI-Compatible**, and supports:
* /chat/completions
* /embeddings
* /completions
* /image/generations
* /moderations
* /audio/transcriptions
* /audio/speech
* [Assistants API endpoints](https://docs.litellm.ai/docs/assistants)
* [Batches API endpoints](https://docs.litellm.ai/docs/batches)
* [Fine-Tuning API endpoints](https://docs.litellm.ai/docs/fine_tuning)
LiteLLM Proxy is **Azure OpenAI-compatible**:
* /chat/completions
* /completions
* /embeddings
LiteLLM Proxy is **Anthropic-compatible**:
* /messages
LiteLLM Proxy is **Vertex AI compatible**:
- [Supports ALL Vertex Endpoints](../vertex_ai)
This doc covers:
* /chat/completion
* /embedding
These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**; it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.
To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)
To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)
:::info :::info
@ -207,6 +243,81 @@ console.log(message);
``` ```
</TabItem> </TabItem>
<TabItem value="openai JS" label="OpenAI JS">
```js
const { OpenAI } = require('openai');
const openai = new OpenAI({
apiKey: "sk-1234", // This is the default and can be omitted
baseURL: "http://0.0.0.0:4000"
});
async function main() {
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'gpt-3.5-turbo',
}, {"metadata": {
"generation_name": "ishaan-generation-openaijs-client",
"generation_id": "openaijs-client-gen-id22",
"trace_id": "openaijs-client-trace-id22",
"trace_user_id": "openaijs-client-user-id2"
}});
}
main();
```
</TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
<TabItem value="mistral-py" label="Mistral Python SDK">
```python
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
client = MistralClient(api_key="sk-1234", endpoint="http://0.0.0.0:4000")
chat_response = client.chat(
model="mistral-small-latest",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
)
print(chat_response.choices[0].message.content)
```
</TabItem>
<TabItem value="instructor" label="Instructor"> <TabItem value="instructor" label="Instructor">
```python ```python
@ -214,11 +325,12 @@ from openai import OpenAI
import instructor import instructor
from pydantic import BaseModel from pydantic import BaseModel
my_proxy_api_key = "" # e.g. sk-1234 my_proxy_api_key = "" # e.g. sk-1234 - LITELLM KEY
my_proxy_base_url = "" # e.g. http://0.0.0.0:4000 my_proxy_base_url = "" # e.g. http://0.0.0.0:4000 - LITELLM PROXY BASE URL
# This enables response_model keyword # This enables response_model keyword
# from client.chat.completions.create # from client.chat.completions.create
## WORKS ACROSS OPENAI/ANTHROPIC/VERTEXAI/ETC. - all LITELLM SUPPORTED MODELS!
client = instructor.from_openai(OpenAI(api_key=my_proxy_api_key, base_url=my_proxy_base_url)) client = instructor.from_openai(OpenAI(api_key=my_proxy_api_key, base_url=my_proxy_base_url))
class UserDetail(BaseModel): class UserDetail(BaseModel):
@ -539,6 +651,166 @@ curl --location 'http://0.0.0.0:4000/moderations' \
``` ```
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```python
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Advanced ## Advanced
### (BETA) Batch Completions - pass multiple models ### (BETA) Batch Completions - pass multiple models

View file

@ -484,6 +484,8 @@ You can set:
- tpm limits (tokens per minute) - tpm limits (tokens per minute)
- rpm limits (requests per minute) - rpm limits (requests per minute)
- max parallel requests - max parallel requests
- rpm / tpm limits per model for a given key
<Tabs> <Tabs>
<TabItem value="per-user" label="Per Internal User"> <TabItem value="per-user" label="Per Internal User">
@ -532,6 +534,60 @@ curl --location 'http://0.0.0.0:4000/key/generate' \
} }
``` ```
</TabItem>
<TabItem value="per-key-model" label="Per API Key Per model">
**Set rate limits per model per api key**
Set `model_rpm_limit` and `model_tpm_limit` to set rate limits per model per api key
Here `gpt-4` is the `model_name` set on the [litellm config.yaml](configs.md)
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"model_rpm_limit": {"gpt-4": 2}, "model_tpm_limit": {"gpt-4":}}'
```
**Expected Response**
```json
{
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
"expires": "2024-01-18T20:48:44.297973",
}
```
**Verify Model Rate Limits set correctly for this key**
**Make /chat/completions request check if `x-litellm-key-remaining-requests-gpt-4` returned**
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-ulGNRXWtv7M0lFnnsQk0wQ" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude!ss eho ares"}
]
}'
```
**Expected headers**
```shell
x-litellm-key-remaining-requests-gpt-4: 1
x-litellm-key-remaining-tokens-gpt-4: 179
```
These headers indicate:
- 1 request remaining for the GPT-4 model for key=`sk-ulGNRXWtv7M0lFnnsQk0wQ`
- 179 tokens remaining for the GPT-4 model for key=`sk-ulGNRXWtv7M0lFnnsQk0wQ`
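To read these headers from Python, the OpenAI SDK exposes the raw response - a sketch, assuming the key generated above:
```python
import openai

client = openai.OpenAI(
    api_key="sk-ulGNRXWtv7M0lFnnsQk0wQ",  # key generated above
    base_url="http://localhost:4000",
)

raw = client.chat.completions.with_raw_response.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello, Claude!"}],
)
print(raw.headers.get("x-litellm-key-remaining-requests-gpt-4"))
print(raw.headers.get("x-litellm-key-remaining-tokens-gpt-4"))

completion = raw.parse()  # the usual ChatCompletion object
```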
</TabItem> </TabItem>
<TabItem value="per-end-user" label="For customers"> <TabItem value="per-end-user" label="For customers">

View file

@ -34,6 +34,7 @@ You can then generate keys by hitting the `/key/generate` endpoint.
[**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672) [**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672)
## **Quick Start - Generate a Key**
**Step 1: Save postgres db url** **Step 1: Save postgres db url**
```yaml ```yaml
@ -65,7 +66,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}' --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}'
``` ```
## Advanced - Spend Tracking ## Spend Tracking
Get spend per: Get spend per:
- key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get) - key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get)
@ -223,9 +224,70 @@ Expected Response
</TabItem> </TabItem>
</Tabs> </Tabs>
## Advanced - Model Access ## **Model Access**
### Restrict models by `team_id` ### **Restrict models by Virtual Key**
Set allowed models for a key using the `models` param
```shell
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"]}'
```
:::info
This key can only make requests to `models` that are `gpt-3.5-turbo` or `gpt-4`
:::
Verify this is set correctly by
<Tabs>
<TabItem label="Allowed Access" value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
<TabItem label="Disallowed Access" value = "not-allowed">
:::info
Expect this to fail since gpt-4o is not in the `models` for the key generated
:::
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
### **Restrict models by `team_id`**
`litellm-dev` can only access `azure-gpt-3.5` `litellm-dev` can only access `azure-gpt-3.5`
**1. Create a team via `/team/new`** **1. Create a team via `/team/new`**
@ -269,6 +331,157 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}% {"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
``` ```
### **Grant Access to new model (Access Groups)**
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
**Step 1. Assign model, access group in config.yaml**
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
model_info:
access_groups: ["beta-models"] # 👈 Model Access Group
- model_name: fireworks-llama-v3-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
api_key: "os.environ/FIREWORKS"
model_info:
access_groups: ["beta-models"] # 👈 Model Access Group
```
<Tabs>
<TabItem value="key" label="Key Access Groups">
**Create key with access group**
```bash
curl --location 'http://localhost:4000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
"max_budget": 0,}'
```
Test Key
<Tabs>
<TabItem label="Allowed Access" value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
<TabItem label="Disallowed Access" value = "not-allowed">
:::info
Expect this to fail since gpt-4o is not in the `beta-models` access group
:::
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="team" label="Team Access Groups">
Create Team
```shell
curl --location 'http://localhost:4000/team/new' \
-H 'Authorization: Bearer sk-<key-from-previous-step>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"]}'
```
Create Key for Team
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-<key-from-previous-step>' \
--header 'Content-Type: application/json' \
--data '{"team_id": "0ac97648-c194-4c90-8cd6-40af7b0d2d2a"}
```
Test Key
<Tabs>
<TabItem label="Allowed Access" value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
<TabItem label="Disallowed Access" value = "not-allowed">
:::info
Expect this to fail since gpt-4o is not in the `beta-models` access group
:::
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Model Aliases ### Model Aliases
If a user is expected to use a given model (i.e. gpt3-5), and you want to: If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@ -319,35 +532,73 @@ curl -X POST "https://0.0.0.0:4000/key/generate" \
- **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py) - **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
### Grant Access to new model ## Advanced
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.) ### Pass LiteLLM Key in custom header
**Step 1. Assign model, access group in config.yaml** Use this to make LiteLLM proxy look for the virtual key in a custom header instead of the default `"Authorization"` header
**Step 1** Define `litellm_key_header_name` name on litellm config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: text-embedding-ada-002 - model_name: fake-openai-endpoint
litellm_params: litellm_params:
model: azure/azure-embedding-model model: openai/fake
api_base: "os.environ/AZURE_API_BASE" api_key: fake-key
api_key: "os.environ/AZURE_API_KEY" api_base: https://exampleopenaiendpoint-production.up.railway.app/
api_version: "2023-07-01-preview"
model_info: general_settings:
access_groups: ["beta-models"] # 👈 Model Access Group master_key: sk-1234
litellm_key_header_name: "X-Litellm-Key" # 👈 Key Change
``` ```
**Step 2. Create key with access group** **Step 2** Test it
```bash In this request, litellm will use the Virtual key in the `X-Litellm-Key` header
curl --location 'http://localhost:4000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \ <Tabs>
-H 'Content-Type: application/json' \ <TabItem value="curl" label="curl">
-d '{"models": ["beta-models"], # 👈 Model Access Group
"max_budget": 0,}' ```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "X-Litellm-Key: Bearer sk-1234" \
-H "Authorization: Bearer bad-key" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
``` ```
## Advanced - Custom Auth **Expected Response**
Expect to see a successful response from the litellm proxy since the key passed in `X-Litellm-Key` is valid
```shell
{"id":"chatcmpl-f9b2b79a7c30477ab93cd0e717d1773e","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}
```
</TabItem>
<TabItem value="python" label="OpenAI Python SDK">
```python
import openai

client = openai.OpenAI(
api_key="not-used",
base_url="https://api-gateway-url.com/llmservc/api/litellmp",
default_headers={
"Authorization": f"Bearer {API_GATEWAY_TOKEN}", # (optional) For your API Gateway
"X-Litellm-Key": f"Bearer sk-1234" # For LiteLLM Proxy
}
)
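
# then call the proxy as usual - LiteLLM reads the virtual key from the "X-Litellm-Key" header
response = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
)
print(response)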
```
</TabItem>
</Tabs>
### Custom Auth
You can now override the default api key auth. You can now override the default api key auth.
@ -486,7 +737,7 @@ general_settings:
``` ```
### Upperbound /key/generate params

Use this if you need to set upper bounds for `max_budget`, `budget_duration`, or any `/key/generate` param per key.

Set `litellm_settings:upperbound_key_generate_params`:
@ -502,7 +753,7 @@ litellm_settings:
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100`, since 100 is the upper bound (see the sketch below)
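As a minimal sketch of that flow, assuming the proxy runs on `http://localhost:4000`, the master key is `sk-1234`, and `upperbound_key_generate_params` caps `max_budget` at 100 as described above:

```python
import requests

# Ask for a budget above the configured upper bound
resp = requests.post(
    "http://localhost:4000/key/generate",
    headers={
        "Authorization": "Bearer sk-1234",  # master key (assumed for this sketch)
        "Content-Type": "application/json",
    },
    json={"max_budget": 200},
)
resp.raise_for_status()

# Per the behaviour described above, the created key is capped at max_budget=100
print(resp.json())
```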
### Default /key/generate params

Use this if you need to control the default `max_budget` or any `/key/generate` param per key.

When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`.
@ -518,7 +769,11 @@ litellm_settings:
    team_id: "core-infra"
```
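A quick sketch of that behaviour, under the same local-proxy assumptions as above (hypothetical master key `sk-1234`); the request omits `max_budget`, so the proxy fills in the configured defaults:

```python
import requests

# No max_budget / team_id in the request body - the proxy applies
# litellm_settings.default_key_generate_params
resp = requests.post(
    "http://localhost:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},  # master key (assumed)
    json={},
)
print(resp.json())  # inspect the returned key's budget / team settings
```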
## **Next Steps - Set Budgets, Rate Limits per Virtual Key**
[Follow this doc to set budgets, rate limiters per virtual key with LiteLLM](users)
## Endpoint Reference (Spec)
### Keys

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local LiteLLM Proxy Server

A fast and lightweight OpenAI-compatible server to call 100+ LLM APIs.

@ -14,7 +14,7 @@ In production, litellm supports using Redis as a way to track cooldown server an
:::info

If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./proxy/load_balancing.md)

:::
@ -88,8 +88,8 @@ print(response)
### Available Endpoints
- `router.completion()` - chat completions endpoint to call 100+ LLMs (see the sketch after this list)
- `router.acompletion()` - async chat completion calls
- `router.embedding()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
- `router.aembedding()` - async embeddings calls
- `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
- `router.atext_completion()` - async text completion calls
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
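To make the list concrete, here is a minimal sketch that wires two deployments behind one `model_name` and calls a couple of the endpoints above; the model names, keys, and the Azure deployment are placeholders:

```python
import asyncio
import os

from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # alias used by callers
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.environ["OPENAI_API_KEY"],
        },
    },
    {
        "model_name": "gpt-3.5-turbo",  # second deployment behind the same alias
        "litellm_params": {
            "model": "azure/<your-deployment-name>",  # placeholder deployment
            "api_key": os.environ["AZURE_API_KEY"],
            "api_base": os.environ["AZURE_API_BASE"],
            "api_version": "2023-07-01-preview",
        },
    },
]

router = Router(model_list=model_list)

# sync chat completion - the router picks one of the two deployments
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)


# async chat completion
async def main():
    return await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
    )


print(asyncio.run(main()))
```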
@ -1637,7 +1637,7 @@ response = router.completion(
## Deploy Router

If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)

## Init Params for the litellm.Router

@ -41,7 +41,7 @@ router = Router(
)

try:
    _response = await router.acompletion( # 👈 ADDS TO QUEUE + POLLS + MAKES CALL
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
        priority=0,  # 👈 LOWER IS BETTER
@ -52,13 +52,13 @@ except Exception as e:
## LiteLLM Proxy

To prioritize requests on LiteLLM Proxy, add `priority` to the request.

<Tabs>
<TabItem value="curl" label="curl">
```curl
curl -X POST 'http://localhost:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
@ -128,7 +128,7 @@ router = Router(
)

try:
    _response = await router.acompletion( # 👈 ADDS TO QUEUE + POLLS + MAKES CALL
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
        priority=0,  # 👈 LOWER IS BETTER
@ -147,6 +147,9 @@ model_list:
      mock_response: "hello world!"
      api_key: my-good-key

litellm_settings:
  request_timeout: 600 # 👈 Will keep retrying until timeout occurs

router_settings:
  redis_host: os.environ/REDIS_HOST
  redis_password: os.environ/REDIS_PASSWORD

@ -0,0 +1,65 @@
# Custom Pricing - SageMaker, Azure, etc.

Register custom pricing for a SageMaker completion model.

For cost-per-second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os

from litellm import completion, completion_cost

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""


def test_completion_sagemaker():
    try:
        print("testing sagemaker")
        response = completion(
            model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
            input_cost_per_second=0.000420,
        )
        # Add any assertions here to check the response
        print(response)
        cost = completion_cost(completion_response=response)
        print(cost)
    except Exception as e:
        raise Exception(f"Error occurred: {e}")
```
## Cost Per Token (e.g. Azure)
```python
import os

from litellm import completion, completion_cost

## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""


def test_completion_azure_model():
    try:
        print("testing azure custom pricing")
        # azure call
        response = completion(
            model="azure/<your_deployment_name>",
            messages=[{"content": "Hello, how are you?", "role": "user"}],
            input_cost_per_token=0.005,
            output_cost_per_token=1,
        )
        # Add any assertions here to check the response
        print(response)
        cost = completion_cost(completion_response=response)
        print(cost)
    except Exception as e:
        raise Exception(f"Error occurred: {e}")


test_completion_azure_model()
```
