Compare commits: v1.65.4-st...main (293 commits)
```
@@ -610,6 +610,8 @@ jobs:
          name: Install Dependencies
          command: |
            python -m pip install --upgrade pip
            pip install wheel
            pip install --upgrade pip wheel setuptools
            python -m pip install -r requirements.txt
            pip install "pytest==7.3.1"
            pip install "respx==0.21.1"

@@ -1125,6 +1127,7 @@ jobs:
          name: Install Dependencies
          command: |
            python -m pip install --upgrade pip
            python -m pip install wheel setuptools
            python -m pip install -r requirements.txt
            pip install "pytest==7.3.1"
            pip install "pytest-retry==1.6.3"
```
```
@@ -2387,6 +2390,114 @@ jobs:
            echo "triggering load testing server for version ${VERSION} and commit ${CIRCLE_SHA1}"
            curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}&release_type=nightly"

  publish_proxy_extras:
    docker:
      - image: cimg/python:3.8
    working_directory: ~/project/litellm-proxy-extras
    environment:
      TWINE_USERNAME: __token__

    steps:
      - checkout:
          path: ~/project

      - run:
          name: Check if litellm-proxy-extras dir or pyproject.toml was modified
          command: |
            echo "Install TOML package."
            python -m pip install toml
            # Get current version from pyproject.toml
            CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])")

            # Get last published version from PyPI
            LAST_VERSION=$(curl -s https://pypi.org/pypi/litellm-proxy-extras/json | python -c "import json, sys; print(json.load(sys.stdin)['info']['version'])")

            echo "Current version: $CURRENT_VERSION"
            echo "Last published version: $LAST_VERSION"

            # Compare versions using Python's packaging.version
            VERSION_COMPARE=$(python -c "from packaging import version; print(1 if version.parse('$CURRENT_VERSION') < version.parse('$LAST_VERSION') else 0)")

            echo "Version compare: $VERSION_COMPARE"
            if [ "$VERSION_COMPARE" = "1" ]; then
              echo "Error: Current version ($CURRENT_VERSION) is less than last published version ($LAST_VERSION)"
              exit 1
            fi

            # If versions are equal or current is greater, check contents
            pip download --no-deps litellm-proxy-extras==$LAST_VERSION -d /tmp

            echo "Contents of /tmp directory:"
            ls -la /tmp

            # Find the downloaded file (could be .whl or .tar.gz)
            DOWNLOADED_FILE=$(ls /tmp/litellm_proxy_extras-*)
            echo "Downloaded file: $DOWNLOADED_FILE"

            # Extract based on file extension
            if [[ "$DOWNLOADED_FILE" == *.whl ]]; then
              echo "Extracting wheel file..."
              unzip -q "$DOWNLOADED_FILE" -d /tmp/extracted
              EXTRACTED_DIR="/tmp/extracted"
            else
              echo "Extracting tar.gz file..."
              tar -xzf "$DOWNLOADED_FILE" -C /tmp
              EXTRACTED_DIR="/tmp/litellm_proxy_extras-$LAST_VERSION"
            fi

            echo "Contents of extracted package:"
            ls -R "$EXTRACTED_DIR"

            # Compare contents
            if ! diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras; then
              if [ "$CURRENT_VERSION" = "$LAST_VERSION" ]; then
                echo "Error: Changes detected in litellm-proxy-extras but version was not bumped"
                echo "Current version: $CURRENT_VERSION"
                echo "Last published version: $LAST_VERSION"
                echo "Changes:"
                diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras
                exit 1
              fi
            else
              echo "No changes detected in litellm-proxy-extras. Skipping PyPI publish."
              circleci step halt
            fi

      - run:
          name: Get new version
          command: |
            cd litellm-proxy-extras
            NEW_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])")
            echo "export NEW_VERSION=$NEW_VERSION" >> $BASH_ENV

      - run:
          name: Check if versions match
          command: |
            cd ~/project
            # Check pyproject.toml
            CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['dependencies']['litellm-proxy-extras'].split('\"')[1])")
            if [ "$CURRENT_VERSION" != "$NEW_VERSION" ]; then
              echo "Error: Version in pyproject.toml ($CURRENT_VERSION) doesn't match new version ($NEW_VERSION)"
              exit 1
            fi

            # Check requirements.txt
            REQ_VERSION=$(grep -oP 'litellm-proxy-extras==\K[0-9.]+' requirements.txt)
            if [ "$REQ_VERSION" != "$NEW_VERSION" ]; then
              echo "Error: Version in requirements.txt ($REQ_VERSION) doesn't match new version ($NEW_VERSION)"
              exit 1
            fi

      - run:
          name: Publish to PyPI
          command: |
            cd litellm-proxy-extras
            echo -e "[pypi]\nusername = $PYPI_PUBLISH_USERNAME\npassword = $PYPI_PUBLISH_PASSWORD" > ~/.pypirc
            python -m pip install --upgrade pip build twine setuptools wheel
            rm -rf build dist
            python -m build
            twine upload --verbose dist/*

  e2e_ui_testing:
    machine:
      image: ubuntu-2204:2023.10.1
```
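The version gate in the job above boils down to a `packaging.version` comparison. A minimal standalone sketch of that check (the version strings below are placeholder assumptions, not values read from the repo):

```python
from packaging import version

# Placeholder values; in the CI job these come from pyproject.toml and PyPI.
current_version = "0.1.3"
last_published_version = "0.1.2"

# Fail if the in-repo version is older than what is already published.
if version.parse(current_version) < version.parse(last_published_version):
    raise SystemExit(
        f"Error: Current version ({current_version}) is less than "
        f"last published version ({last_published_version})"
    )
print("Version check passed")
```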
```
@@ -2782,6 +2893,11 @@ workflows:
              only:
                - main
                - /litellm_.*/
      - publish_proxy_extras:
          filters:
            branches:
              only:
                - main
      - publish_to_pypi:
          requires:
            - local_testing
```
```
@@ -2816,7 +2932,5 @@ workflows:
            - proxy_build_from_pip_tests
            - proxy_pass_through_endpoint_tests
            - check_code_and_doc_quality
          filters:
            branches:
              only:
                - main
            - publish_proxy_extras
```
```
@@ -20,6 +20,8 @@ REPLICATE_API_TOKEN = ""
ANTHROPIC_API_KEY = ""
# Infisical
INFISICAL_TOKEN = ""
# INFINITY
INFINITY_API_KEY = ""

# Development Configs
LITELLM_MASTER_KEY = "sk-1234"
```
.gitignore — 3 changes (vendored)

```
@@ -73,6 +73,7 @@ tests/local_testing/log.txt
.codegpt
litellm/proxy/_new_new_secret_config.yaml
litellm/proxy/custom_guardrail.py
.mypy_cache/*
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html

@@ -85,3 +86,5 @@ litellm/proxy/db/migrations/0_init/migration.sql
litellm/proxy/db/migrations/*
litellm/proxy/migrations/*config.yaml
litellm/proxy/migrations/*
config.yaml
tests/litellm/litellm_core_utils/llm_cost_calc/log.txt
```
```
@@ -12,8 +12,7 @@ WORKDIR /app
USER root

# Install build dependencies
RUN apk update && \
    apk add --no-cache gcc python3-dev openssl openssl-dev
RUN apk add --no-cache gcc python3-dev openssl openssl-dev


RUN pip install --upgrade pip && \

@@ -52,8 +51,7 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime
USER root

# Install runtime dependencies
RUN apk update && \
    apk add --no-cache openssl
RUN apk add --no-cache openssl

WORKDIR /app
# Copy the current directory contents into the container at /app
```
314
cookbook/LiteLLM_HuggingFace.ipynb
vendored
|
@ -7,7 +7,8 @@
|
|||
},
|
||||
"source": [
|
||||
"## LiteLLM Hugging Face\n",
|
||||
"Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface"
|
||||
"\n",
|
||||
"Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -27,23 +28,18 @@
|
|||
"id": "yp5UXRqtpu9f"
|
||||
},
|
||||
"source": [
|
||||
"## Hugging Face Free Serverless Inference API\n",
|
||||
"Read more about the Free Serverless Inference API here: https://huggingface.co/docs/api-inference.\n",
|
||||
"## Serverless Inference Providers\n",
|
||||
"\n",
|
||||
"In order to use litellm to call Serverless Inference API:\n",
|
||||
"Read more about Inference Providers here: https://huggingface.co/blog/inference-providers.\n",
|
||||
"\n",
|
||||
"* Browse Serverless Inference compatible models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation.\n",
|
||||
"* Copy the model name from hugging face\n",
|
||||
"* Set `model = \"huggingface/<model-name>\"`\n",
|
||||
"In order to use litellm with Hugging Face Inference Providers, you need to set `model=huggingface/<provider>/<model-id>`.\n",
|
||||
"\n",
|
||||
"Example set `model=huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct` to call `meta-llama/Meta-Llama-3.1-8B-Instruct`\n",
|
||||
"\n",
|
||||
"https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
"Example: `huggingface/together/deepseek-ai/DeepSeek-R1` to run DeepSeek-R1 (https://huggingface.co/deepseek-ai/DeepSeek-R1) through Together AI.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
|
@ -51,107 +47,18 @@
|
|||
"id": "Pi5Oww8gpCUm",
|
||||
"outputId": "659a67c7-f90d-4c06-b94e-2c4aa92d897a"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ModelResponse(id='chatcmpl-c54dfb68-1491-4d68-a4dc-35e603ea718a', choices=[Choices(finish_reason='eos_token', index=0, message=Message(content=\"I'm just a computer program, so I don't have feelings, but thank you for asking! How can I assist you today?\", role='assistant', tool_calls=None, function_call=None))], created=1724858285, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=27, prompt_tokens=47, total_tokens=74))\n",
|
||||
"ModelResponse(id='chatcmpl-d2ae38e6-4974-431c-bb9b-3fa3f95e5a6d', choices=[Choices(finish_reason='length', index=0, message=Message(content=\"\\n\\nI’m doing well, thank you. I’ve been keeping busy with work and some personal projects. How about you?\\n\\nI'm doing well, thank you. I've been enjoying some time off and catching up on some reading. How can I assist you today?\\n\\nI'm looking for a good book to read. Do you have any recommendations?\\n\\nOf course! Here are a few book recommendations across different genres:\\n\\n1.\", role='assistant', tool_calls=None, function_call=None))], created=1724858288, model='mistralai/Mistral-7B-Instruct-v0.3', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=85, prompt_tokens=6, total_tokens=91))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import litellm\n",
|
||||
"from litellm import completion\n",
|
||||
"\n",
|
||||
"# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n",
|
||||
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
|
||||
"# You can create a HF token here: https://huggingface.co/settings/tokens\n",
|
||||
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
|
||||
"\n",
|
||||
"# Call https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct\n",
|
||||
"# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
|
||||
")\n",
|
||||
"print(response)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Call https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"huggingface/mistralai/Mistral-7B-Instruct-v0.3\",\n",
|
||||
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "-klhAhjLtclv"
|
||||
},
|
||||
"source": [
|
||||
"## Hugging Face Dedicated Inference Endpoints\n",
|
||||
"\n",
|
||||
"Steps to use\n",
|
||||
"* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n",
|
||||
"* Set `api_base` to your deployed api base\n",
|
||||
"* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "Lbmw8Gl_pHns",
|
||||
"outputId": "ea8408bf-1cc3-4670-ecea-f12666d204a8"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"finish_reason\": \"length\",\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"message\": {\n",
|
||||
" \"content\": \"\\n\\nI am doing well, thank you for asking. How about you?\\nI am doing\",\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"logprobs\": -8.9481967812\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"id\": \"chatcmpl-74dc9d89-3916-47ce-9bea-b80e66660f77\",\n",
|
||||
" \"created\": 1695871068.8413374,\n",
|
||||
" \"model\": \"glaiveai/glaive-coder-7b\",\n",
|
||||
" \"usage\": {\n",
|
||||
" \"prompt_tokens\": 6,\n",
|
||||
" \"completion_tokens\": 18,\n",
|
||||
" \"total_tokens\": 24\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import litellm\n",
|
||||
"\n",
|
||||
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
|
||||
"\n",
|
||||
"# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b\n",
|
||||
"# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
|
||||
"# set api base to your deployed api endpoint from hugging face\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"huggingface/glaiveai/glaive-coder-7b\",\n",
|
||||
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
|
||||
" api_base=\"https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud\"\n",
|
||||
"# Call DeepSeek-R1 model through Together AI\n",
|
||||
"response = completion(\n",
|
||||
" model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n",
|
||||
" messages=[{\"content\": \"How many r's are in the word `strawberry`?\", \"role\": \"user\"}],\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
]
|
||||
|
@ -162,13 +69,12 @@
|
|||
"id": "EU0UubrKzTFe"
|
||||
},
|
||||
"source": [
|
||||
"## HuggingFace - Streaming (Serveless or Dedicated)\n",
|
||||
"Set stream = True"
|
||||
"## Streaming\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
|
@ -176,74 +82,147 @@
|
|||
"id": "y-QfIvA-uJKX",
|
||||
"outputId": "b007bb98-00d0-44a4-8264-c8a2caed6768"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<litellm.utils.CustomStreamWrapper object at 0x1278471d0>\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='I', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"'m\", role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' just', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' a', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' computer', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' program', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=',', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' so', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' I', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' don', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"'t\", role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' have', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' feelings', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=',', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' but', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' thank', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' you', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' for', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' asking', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='!', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' How', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' can', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' I', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' assist', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' you', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' today', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='?', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='<|eot_id|>', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n",
|
||||
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import litellm\n",
|
||||
"from litellm import completion\n",
|
||||
"\n",
|
||||
"# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n",
|
||||
"os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n",
|
||||
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
|
||||
"\n",
|
||||
"# Call https://huggingface.co/glaiveai/glaive-coder-7b\n",
|
||||
"# add the 'huggingface/' prefix to the model to set huggingface as the provider\n",
|
||||
"# set api base to your deployed api endpoint from hugging face\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n",
|
||||
" stream=True\n",
|
||||
"response = completion(\n",
|
||||
" model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"How many r's are in the word `strawberry`?\",\n",
|
||||
" \n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" stream=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(response)\n",
|
||||
"\n",
|
||||
"for chunk in response:\n",
|
||||
" print(chunk)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## With images as input\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "CKXAnK55zQRl"
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"from litellm import completion\n",
|
||||
"\n",
|
||||
"# Set your Hugging Face Token\n",
|
||||
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\"type\": \"text\", \"text\": \"What's in this image?\"},\n",
|
||||
" {\n",
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"response = completion(\n",
|
||||
" model=\"huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct\",\n",
|
||||
" messages=messages,\n",
|
||||
")\n",
|
||||
"print(response.choices[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tools - Function Calling\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from litellm import completion\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Set your Hugging Face Token\n",
|
||||
"os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n",
|
||||
"\n",
|
||||
"tools = [\n",
|
||||
" {\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": {\n",
|
||||
" \"name\": \"get_current_weather\",\n",
|
||||
" \"description\": \"Get the current weather in a given location\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"location\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
|
||||
" },\n",
|
||||
" \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
|
||||
" },\n",
|
||||
" \"required\": [\"location\"],\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
|
||||
"\n",
|
||||
"response = completion(\n",
|
||||
" model=\"huggingface/sambanova/meta-llama/Llama-3.1-8B-Instruct\", messages=messages, tools=tools, tool_choice=\"auto\"\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Hugging Face Dedicated Inference Endpoints\n",
|
||||
"\n",
|
||||
"Steps to use\n",
|
||||
"\n",
|
||||
"- Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n",
|
||||
"- Set `api_base` to your deployed api base\n",
|
||||
"- set the model to `huggingface/tgi` so that litellm knows it's a huggingface Deployed Inference Endpoint.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import litellm\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"huggingface/tgi\",\n",
|
||||
" messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}],\n",
|
||||
" api_base=\"https://my-endpoint.endpoints.huggingface.cloud/v1/\",\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -251,7 +230,8 @@
|
|||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
|
@ -264,7 +244,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
```
@@ -18,7 +18,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.2
version: 0.4.3

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
```
```
@@ -97,6 +97,9 @@ spec:
              value: {{ $val | quote }}
            {{- end }}
          {{- end }}
          {{- with .Values.extraEnvVars }}
            {{- toYaml . | nindent 12 }}
          {{- end }}
          envFrom:
            {{- range .Values.environmentSecrets }}
            - secretRef:
```
```
@@ -16,6 +16,7 @@ spec:
        {{- toYaml . | nindent 8 }}
      {{- end }}
    spec:
      serviceAccountName: {{ include "litellm.serviceAccountName" . }}
      containers:
        - name: prisma-migrations
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
```
```
@@ -2,6 +2,10 @@ apiVersion: v1
kind: Service
metadata:
  name: {{ include "litellm.fullname" . }}
  {{- with .Values.service.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
spec:
```
```
@@ -80,3 +80,38 @@ tests:
            secretKeyRef:
              name: my-secret
              key: my-key
  - it: should work with extraEnvVars
    template: deployment.yaml
    set:
      extraEnvVars:
        - name: EXTRA_ENV_VAR
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['env']
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: EXTRA_ENV_VAR
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['env']
  - it: should work with both extraEnvVars and envVars
    template: deployment.yaml
    set:
      envVars:
        ENV_VAR: ENV_VAR_VALUE
      extraEnvVars:
        - name: EXTRA_ENV_VAR
          value: EXTRA_ENV_VAR_VALUE
    asserts:
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: ENV_VAR
            value: ENV_VAR_VALUE
      - contains:
          path: spec.template.spec.containers[0].env
          content:
            name: EXTRA_ENV_VAR
            value: EXTRA_ENV_VAR_VALUE
```
```
@@ -195,9 +195,15 @@ migrationJob:
  annotations: {}
  ttlSecondsAfterFinished: 120

# Additional environment variables to be added to the deployment
# Additional environment variables to be added to the deployment as a map of key-value pairs
envVars: {
  # USE_DDTRACE: "true"
}

# Additional environment variables to be added to the deployment as a list of k8s env vars
extraEnvVars: {
  # - name: EXTRA_ENV_VAR
  #   value: EXTRA_ENV_VAR_VALUE
}
```
```
@@ -35,7 +35,7 @@ RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
FROM $LITELLM_RUNTIME_IMAGE AS runtime

# Update dependencies and clean up
RUN apk update && apk upgrade && rm -rf /var/cache/apk/*
RUN apk upgrade --no-cache

WORKDIR /app
```
```
@@ -12,8 +12,7 @@ WORKDIR /app
USER root

# Install build dependencies
RUN apk update && \
    apk add --no-cache gcc python3-dev openssl openssl-dev
RUN apk add --no-cache gcc python3-dev openssl openssl-dev


RUN pip install --upgrade pip && \

@@ -44,8 +43,7 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime
USER root

# Install runtime dependencies
RUN apk update && \
    apk add --no-cache openssl
RUN apk add --no-cache openssl

WORKDIR /app
# Copy the current directory contents into the container at /app
```
````
@@ -109,7 +109,7 @@ client = anthropic.Anthropic(

response = client.messages.create(
    messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
    model="anthropic/claude-3-haiku-20240307",
    model="anthropic-claude",
    max_tokens=100,
)
```
````
```
@@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';

# Using Audio Models

How to send / receieve audio to a `/chat/completions` endpoint
How to send / receive audio to a `/chat/completions` endpoint


## Audio Output from a model
```
@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
# Using PDF Input
|
||||
|
||||
How to send / receieve pdf's (other document types) to a `/chat/completions` endpoint
|
||||
How to send / receive pdf's (other document types) to a `/chat/completions` endpoint
|
||||
|
||||
Works for:
|
||||
- Vertex AI models (Gemini + Anthropic)
|
||||
|
@ -27,16 +27,18 @@ os.environ["AWS_REGION_NAME"] = ""
|
|||
|
||||
|
||||
# pdf url
|
||||
image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
||||
file_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
||||
|
||||
# model
|
||||
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
|
||||
|
||||
image_content = [
|
||||
file_content = [
|
||||
{"type": "text", "text": "What's this file about?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": image_url, # OR {"url": image_url}
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file_url,
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
|
@ -46,7 +48,7 @@ if not supports_pdf_input(model, None):
|
|||
|
||||
response = completion(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": image_content}],
|
||||
messages=[{"role": "user", "content": file_content}],
|
||||
)
|
||||
assert response is not None
|
||||
```
|
||||
|
@ -83,8 +85,10 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
|||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "What's this file about?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
}
|
||||
}
|
||||
]},
|
||||
]
|
||||
|
@ -118,11 +122,13 @@ base64_url = f"data:application/pdf;base64,{encoded_file}"
|
|||
# model
|
||||
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
|
||||
|
||||
image_content = [
|
||||
file_content = [
|
||||
{"type": "text", "text": "What's this file about?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": base64_url, # OR {"url": base64_url}
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_data": base64_url,
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
|
@ -132,7 +138,7 @@ if not supports_pdf_input(model, None):
|
|||
|
||||
response = completion(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": image_content}],
|
||||
messages=[{"role": "user", "content": file_content}],
|
||||
)
|
||||
assert response is not None
|
||||
```
|
||||
|
@ -169,8 +175,10 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
|||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "What's this file about?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": "data:application/pdf;base64...",
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_data": "data:application/pdf;base64...",
|
||||
}
|
||||
}
|
||||
]},
|
||||
]
|
||||
|
@ -242,92 +250,3 @@ Expected Response
|
|||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## OpenAI 'file' message type
|
||||
|
||||
This is currently only supported for OpenAI models.
|
||||
|
||||
This will be supported for all providers soon.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import base64
|
||||
from litellm import completion
|
||||
|
||||
with open("draconomicon.pdf", "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
base64_string = base64.b64encode(data).decode("utf-8")
|
||||
|
||||
completion = completion(
|
||||
model="gpt-4o",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "draconomicon.pdf",
|
||||
"file_data": f"data:application/pdf;base64,{base64_string}",
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is the first dragon in the book?",
|
||||
}
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: openai-model
|
||||
litellm_params:
|
||||
model: gpt-4o
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "openai-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"filename": "draconomicon.pdf",
|
||||
"file_data": f"data:application/pdf;base64,{base64_string}",
|
||||
}
|
||||
}
|
||||
]}
|
||||
]
|
||||
}'
|
||||
```
|
||||
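The curl body above contains a Python-style f-string that a shell will not interpolate; building the payload in Python avoids that. A hedged sketch of the equivalent proxy request with `requests` (the proxy URL, key, and `openai-model` name come from the config above; everything else is an assumption):

```python
import base64

import requests

# Assumed proxy address and key from the setup steps above.
PROXY_URL = "http://0.0.0.0:4000/chat/completions"
API_KEY = "sk-1234"

# Read and base64-encode the PDF, then embed it as a data URL.
with open("draconomicon.pdf", "rb") as f:
    base64_string = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "openai-model",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "file",
                    "file": {
                        "filename": "draconomicon.pdf",
                        "file_data": f"data:application/pdf;base64,{base64_string}",
                    },
                },
                {"type": "text", "text": "What is the first dragon in the book?"},
            ],
        }
    ],
}

response = requests.post(
    PROXY_URL,
    headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"},
    json=payload,
)
print(response.json()["choices"][0]["message"]["content"])
```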
|
||||
</TabItem>
|
||||
</Tabs>
|
|
````
@@ -194,7 +194,7 @@ Expected Response

## Explicitly specify image type

If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` url's with vertex ai), you can set this explicity via the `format` param.
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` url's with vertex ai), you can set this explicitly via the `format` param.

```python
"image_url": {
````
@ -2,10 +2,12 @@
|
|||
import TabItem from '@theme/TabItem';
|
||||
import Tabs from '@theme/Tabs';
|
||||
|
||||
# /files
|
||||
# Provider Files Endpoints
|
||||
|
||||
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
|
||||
|
||||
Use this to call the provider's `/files` endpoints directly, in the OpenAI format.
|
||||
|
||||
## Quick Start
|
||||
|
||||
- Upload a File
|
||||
|
@ -19,7 +21,7 @@ Files are used to upload documents that can be used with features like Assistant
|
|||
<Tabs>
|
||||
<TabItem value="proxy" label="LiteLLM PROXY Server">
|
||||
|
||||
### 1. Setup config.yaml
|
||||
1. Setup config.yaml
|
||||
|
||||
```
|
||||
# for /files endpoints
|
||||
|
@ -32,7 +34,7 @@ files_settings:
|
|||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
### 2. Start LiteLLM PROXY Server
|
||||
2. Start LiteLLM PROXY Server
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
@ -40,7 +42,7 @@ litellm --config /path/to/config.yaml
|
|||
## RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
### 3. Use OpenAI's /files endpoints
|
||||
3. Use OpenAI's /files endpoints
|
||||
|
||||
Upload a File
|
||||
|
||||
|
|
|
@ -20,9 +20,9 @@ print(f"response: {response}")
|
|||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: dall-e-2 ### RECEIVED MODEL NAME ###
|
||||
- model_name: gpt-image-1 ### RECEIVED MODEL NAME ###
|
||||
litellm_params: # all params accepted by litellm.image_generation()
|
||||
model: azure/dall-e-2 ### MODEL NAME sent to `litellm.image_generation()` ###
|
||||
model: azure/gpt-image-1 ### MODEL NAME sent to `litellm.image_generation()` ###
|
||||
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
|
||||
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
|
||||
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
|
||||
|
@ -47,7 +47,7 @@ curl -X POST 'http://0.0.0.0:4000/v1/images/generations' \
|
|||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-D '{
|
||||
"model": "dall-e-2",
|
||||
"model": "gpt-image-1",
|
||||
"prompt": "A cute baby sea otter",
|
||||
"n": 1,
|
||||
"size": "1024x1024"
|
||||
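The same proxy request can also be made from Python; a sketch using the OpenAI SDK pointed at the proxy (the base URL and key are assumptions):

```python
from openai import OpenAI

# Point the OpenAI client at the LiteLLM proxy (assumed address and key).
client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

image = client.images.generate(
    model="gpt-image-1",  # the model_name configured on the proxy
    prompt="A cute baby sea otter",
    n=1,
    size="1024x1024",
)
print(image.data[0])
```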
|
@ -104,7 +104,7 @@ Any non-openai params, will be treated as provider-specific params, and sent in
|
|||
litellm_logging_obj=None,
|
||||
custom_llm_provider=None,
|
||||
|
||||
- `model`: *string (optional)* The model to use for image generation. Defaults to openai/dall-e-2
|
||||
- `model`: *string (optional)* The model to use for image generation. Defaults to openai/gpt-image-1
|
||||
|
||||
- `n`: *int (optional)* The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported.
|
||||
|
||||
|
@ -112,7 +112,7 @@ Any non-openai params, will be treated as provider-specific params, and sent in
|
|||
|
||||
- `response_format`: *string (optional)* The format in which the generated images are returned. Must be one of url or b64_json.
|
||||
|
||||
- `size`: *string (optional)* The size of the generated images. Must be one of 256x256, 512x512, or 1024x1024 for dall-e-2. Must be one of 1024x1024, 1792x1024, or 1024x1792 for dall-e-3 models.
|
||||
- `size`: *string (optional)* The size of the generated images. Must be one of 256x256, 512x512, or 1024x1024 for gpt-image-1. Must be one of 1024x1024, 1792x1024, or 1024x1792 for dall-e-3 models.
|
||||
|
||||
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
|
||||
|
@ -148,13 +148,14 @@ Any non-openai params, will be treated as provider-specific params, and sent in
|
|||
from litellm import image_generation
|
||||
import os
|
||||
os.environ['OPENAI_API_KEY'] = ""
|
||||
response = image_generation(model='dall-e-2', prompt="cute baby otter")
|
||||
response = image_generation(model='gpt-image-1', prompt="cute baby otter")
|
||||
```
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|----------------------|---------------------------------------------|--------------------------------------|
|
||||
| dall-e-2 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-image-1 | `image_generation(model='gpt-image-1', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
| dall-e-3 | `image_generation(model='dall-e-3', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
| dall-e-2 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
|
||||
|
||||
## Azure OpenAI Image Generation Models
|
||||
|
||||
|
@ -182,8 +183,9 @@ print(response)
|
|||
|
||||
| Model Name | Function Call |
|
||||
|----------------------|---------------------------------------------|
|
||||
| dall-e-2 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
|
||||
| gpt-image-1 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
|
||||
| dall-e-3 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
|
||||
| dall-e-2 | `image_generation(model="azure/<your deployment name>", prompt="cute baby otter")` |
|
||||
|
||||
|
||||
## OpenAI Compatible Image Generation Models
|
||||
|
|
83
docs/my-website/docs/observability/agentops_integration.md
Normal file
|
@ -0,0 +1,83 @@
|
|||
# 🖇️ AgentOps - LLM Observability Platform
|
||||
|
||||
:::tip
|
||||
|
||||
This is community maintained. Please make an issue if you run into a bug:
|
||||
https://github.com/BerriAI/litellm
|
||||
|
||||
:::
|
||||
|
||||
[AgentOps](https://docs.agentops.ai) is an observability platform that enables tracing and monitoring of LLM calls, providing detailed insights into your AI operations.
|
||||
|
||||
## Using AgentOps with LiteLLM
|
||||
|
||||
LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily integrate AgentOps for comprehensive tracing and monitoring of your LLM operations.
|
||||
|
||||
### Integration
|
||||
|
||||
Use just a few lines of code to instantly trace your responses **across all providers** with AgentOps:
|
||||
Get your AgentOps API Keys from https://app.agentops.ai/
|
||||
```python
|
||||
import litellm
|
||||
|
||||
# Configure LiteLLM to use AgentOps
|
||||
litellm.success_callback = ["agentops"]
|
||||
|
||||
# Make your LLM calls as usual
|
||||
response = litellm.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hello, how are you?"}],
|
||||
)
|
||||
```
|
||||
|
||||
Complete Code:
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
# Set env variables
|
||||
os.environ["OPENAI_API_KEY"] = "your-openai-key"
|
||||
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
|
||||
|
||||
# Configure LiteLLM to use AgentOps
|
||||
litellm.success_callback = ["agentops"]
|
||||
|
||||
# OpenAI call
|
||||
response = completion(
|
||||
model="gpt-4",
|
||||
messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
The AgentOps integration can be configured through environment variables:
|
||||
|
||||
- `AGENTOPS_API_KEY` (str, optional): Your AgentOps API key
|
||||
- `AGENTOPS_ENVIRONMENT` (str, optional): Deployment environment (defaults to "production")
|
||||
- `AGENTOPS_SERVICE_NAME` (str, optional): Service name for tracing (defaults to "agentops")
|
||||
|
||||
### Advanced Usage
|
||||
|
||||
You can configure additional settings through environment variables:
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
# Configure AgentOps settings
|
||||
os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key"
|
||||
os.environ["AGENTOPS_ENVIRONMENT"] = "staging"
|
||||
os.environ["AGENTOPS_SERVICE_NAME"] = "my-service"
|
||||
|
||||
# Enable AgentOps tracing
|
||||
litellm.success_callback = ["agentops"]
|
||||
```
|
||||
|
||||
### Support
|
||||
|
||||
For issues or questions, please refer to:
|
||||
- [AgentOps Documentation](https://docs.agentops.ai)
|
||||
- [LiteLLM Documentation](https://docs.litellm.ai)
|
|
````
@@ -53,7 +53,7 @@ response = completion(

## Additional information in metadata

You can send any additional information to Greenscale by using the `metadata` field in completion and `greenscale_` prefix. This can be useful for sending metadata about the request, such as the project and application name, customer_id, enviornment, or any other information you want to track usage. `greenscale_project` and `greenscale_application` are required fields.
You can send any additional information to Greenscale by using the `metadata` field in completion and `greenscale_` prefix. This can be useful for sending metadata about the request, such as the project and application name, customer_id, environment, or any other information you want to track usage. `greenscale_project` and `greenscale_application` are required fields.

```python
#openai call with additional metadata
````
```
@@ -185,7 +185,7 @@ curl --location --request POST 'http://0.0.0.0:4000/chat/completions' \
* `trace_release` - Release for the trace, defaults to `None`
* `trace_metadata` - Metadata for the trace, defaults to `None`
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
* `tags` - Tags for the trace, defeaults to `None`
* `tags` - Tags for the trace, defaults to `None`

##### Updatable Parameters on Continuation
```
```
@@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f

| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Cost Tracking | ✅ | Supported for `/v1/chat`, and `/v2/chat` |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |
```
217
docs/my-website/docs/pass_through/mistral.md
Normal file
|
@ -0,0 +1,217 @@
|
|||
# Mistral
|
||||
|
||||
Pass-through endpoints for Mistral - call provider-specific endpoint, in native format (no translation).
|
||||
|
||||
| Feature | Supported | Notes |
|
||||
|-------|-------|-------|
|
||||
| Cost Tracking | ❌ | Not supported |
|
||||
| Logging | ✅ | works across all integrations |
|
||||
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
|
||||
| Streaming | ✅ | |
|
||||
|
||||
Just replace `https://api.mistral.ai/v1` with `LITELLM_PROXY_BASE_URL/mistral` 🚀
|
||||
|
||||
#### **Example Usage**
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "mistral-ocr-latest",
|
||||
"document": {
|
||||
"type": "image_url",
|
||||
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
|
||||
}
|
||||
|
||||
}'
|
||||
```
|
||||
|
||||
Supports **ALL** Mistral Endpoints (including streaming).
|
||||
|
||||
## Quick Start
|
||||
|
||||
Let's call the Mistral [`/chat/completions` endpoint](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post)
|
||||
|
||||
1. Add MISTRAL_API_KEY to your environment
|
||||
|
||||
```bash
|
||||
export MISTRAL_API_KEY="sk-1234"
|
||||
```
|
||||
|
||||
2. Start LiteLLM Proxy
|
||||
|
||||
```bash
|
||||
litellm
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
Let's call the Mistral `/ocr` endpoint
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "mistral-ocr-latest",
|
||||
"document": {
|
||||
"type": "image_url",
|
||||
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
|
||||
}
|
||||
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
## Examples
|
||||
|
||||
Anything after `http://0.0.0.0:4000/mistral` is treated as a provider-specific route, and handled accordingly.
|
||||
|
||||
Key Changes:
|
||||
|
||||
| **Original Endpoint** | **Replace With** |
|
||||
|------------------------------------------------------|-----------------------------------|
|
||||
| `https://api.mistral.ai/v1` | `http://0.0.0.0:4000/mistral` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
|
||||
| `bearer $MISTRAL_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
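
The same mapping applies when calling the proxy from Python. Here's a minimal sketch using the `requests` library; the proxy URL and key below are placeholders for your own values.

```python
import requests

# Placeholder proxy URL and virtual key - replace with your own values
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000"
LITELLM_VIRTUAL_KEY = "sk-1234"

# Call the native Mistral chat completions endpoint through the proxy
response = requests.post(
    f"{LITELLM_PROXY_BASE_URL}/mistral/v1/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_VIRTUAL_KEY}",
    },
    json={
        "model": "mistral-large-latest",
        "messages": [{"role": "user", "content": "I am going to Paris, what should I see?"}],
        "max_tokens": 2048,
    },
)
print(response.json())
```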
|
||||
|
||||
|
||||
### **Example 1: OCR endpoint**
|
||||
|
||||
#### LiteLLM Proxy Call
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer $LITELLM_API_KEY' \
|
||||
-d '{
|
||||
"model": "mistral-ocr-latest",
|
||||
"document": {
|
||||
"type": "image_url",
|
||||
"image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
#### Direct Mistral API Call
|
||||
|
||||
```bash
|
||||
curl https://api.mistral.ai/v1/ocr \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer ${MISTRAL_API_KEY}" \
|
||||
-d '{
|
||||
"model": "mistral-ocr-latest",
|
||||
"document": {
|
||||
"type": "document_url",
|
||||
"document_url": "https://arxiv.org/pdf/2201.04234"
|
||||
},
|
||||
"include_image_base64": true
|
||||
}'
|
||||
```
|
||||
|
||||
### **Example 2: Chat API**
|
||||
|
||||
#### LiteLLM Proxy Call
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
|
||||
-d '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am going to Paris, what should I see?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.1,
|
||||
"model": "mistral-large-latest",
|
||||
}'
|
||||
```
|
||||
|
||||
#### Direct Mistral API Call
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'https://api.mistral.ai/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am going to Paris, what should I see?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.1,
|
||||
"model": "mistral-large-latest",
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
## Advanced - Use with Virtual Keys
|
||||
|
||||
Pre-requisites
|
||||
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
|
||||
|
||||
Use this to avoid giving developers the raw Mistral API key, while still letting them use Mistral endpoints.
|
||||
|
||||
### Usage
|
||||
|
||||
1. Setup environment
|
||||
|
||||
```bash
|
||||
export DATABASE_URL=""
|
||||
export LITELLM_MASTER_KEY=""
|
||||
export MISTRAL_API_BASE=""
|
||||
```
|
||||
|
||||
```bash
|
||||
litellm
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
2. Generate virtual key
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/key/generate' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{}'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```bash
|
||||
{
|
||||
...
|
||||
"key": "sk-1234ewknldferwedojwojw"
|
||||
}
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am going to Paris, what should I see?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.1,
|
||||
"model": "qwen2.5-7b-instruct",
|
||||
}'
|
||||
```
|
|
@ -13,6 +13,15 @@ Pass-through endpoints for Vertex AI - call provider-specific endpoint, in nativ
|
|||
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
|
||||
| Streaming | ✅ | |
|
||||
|
||||
## Supported Endpoints
|
||||
|
||||
LiteLLM supports 2 vertex ai passthrough routes:
|
||||
|
||||
1. `/vertex_ai` → routes to `https://{vertex_location}-aiplatform.googleapis.com/`
|
||||
2. `/vertex_ai/discovery` → routes to [`https://discoveryengine.googleapis.com`](https://discoveryengine.googleapis.com/)
|
||||
|
||||
## How to use
|
||||
|
||||
Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai`
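
For example, a native `generateContent` request through the proxy could look like the sketch below; the project ID, location, model, proxy URL, and key are all placeholders.

```python
import requests

# Placeholder values - substitute your own proxy URL, key, project, and location
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000"
LITELLM_API_KEY = "sk-1234"
PROJECT_ID = "my-gcp-project"
LOCATION = "us-central1"

# Native Vertex AI generateContent call, routed through the LiteLLM proxy
url = (
    f"{LITELLM_PROXY_BASE_URL}/vertex_ai/v1/projects/{PROJECT_ID}"
    f"/locations/{LOCATION}/publishers/google/models/gemini-1.5-flash:generateContent"
)
response = requests.post(
    url,
    headers={
        "Authorization": f"Bearer {LITELLM_API_KEY}",
        "Content-Type": "application/json",
    },
    json={"contents": [{"role": "user", "parts": [{"text": "Say hello"}]}]},
)
print(response.json())
```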
|
||||
|
||||
LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through:
|
||||
|
@ -213,7 +222,7 @@ curl http://localhost:4000/vertex-ai/v1/projects/${PROJECT_ID}/locations/us-cent
|
|||
|
||||
LiteLLM Proxy Server supports two methods of authentication to Vertex AI:
|
||||
|
||||
1. Pass Vertex Credetials client side to proxy server
|
||||
1. Pass Vertex Credentials client side to proxy server
|
||||
|
||||
2. Set Vertex AI credentials on proxy server
|
||||
|
||||
|
|
185
docs/my-website/docs/pass_through/vllm.md
Normal file
|
@ -0,0 +1,185 @@
|
|||
# VLLM
|
||||
|
||||
Pass-through endpoints for VLLM - call provider-specific endpoint, in native format (no translation).
|
||||
|
||||
| Feature | Supported | Notes |
|
||||
|-------|-------|-------|
|
||||
| Cost Tracking | ❌ | Not supported |
|
||||
| Logging | ✅ | works across all integrations |
|
||||
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
|
||||
| Streaming | ✅ | |
|
||||
|
||||
Just replace `https://my-vllm-server.com` with `LITELLM_PROXY_BASE_URL/vllm` 🚀
|
||||
|
||||
#### **Example Usage**
|
||||
|
||||
```bash
|
||||
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234'
|
||||
```
|
||||
|
||||
Supports **ALL** VLLM Endpoints (including streaming).
|
||||
|
||||
## Quick Start
|
||||
|
||||
Let's call the VLLM [`/metrics` endpoint](https://vllm.readthedocs.io/en/latest/api_reference/api_reference.html)
|
||||
|
||||
1. Add HOSTED_VLLM_API_BASE to your environment
|
||||
|
||||
```bash
|
||||
export HOSTED_VLLM_API_BASE="https://my-vllm-server.com"
|
||||
```
|
||||
|
||||
2. Start LiteLLM Proxy
|
||||
|
||||
```bash
|
||||
litellm
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
Let's call the VLLM `/metrics` endpoint
|
||||
|
||||
```bash
|
||||
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234'
|
||||
```
|
||||
|
||||
|
||||
## Examples
|
||||
|
||||
Anything after `http://0.0.0.0:4000/vllm` is treated as a provider-specific route, and handled accordingly.
|
||||
|
||||
Key Changes:
|
||||
|
||||
| **Original Endpoint** | **Replace With** |
|
||||
|------------------------------------------------------|-----------------------------------|
|
||||
| `https://my-vllm-server.com` | `http://0.0.0.0:4000/vllm` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
|
||||
| `bearer $VLLM_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
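
The same mapping works from Python. A minimal sketch with the `requests` library (proxy URL and key are placeholders):

```python
import requests

# Placeholder proxy URL and virtual key
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000"
LITELLM_VIRTUAL_KEY = "sk-1234"

# GET the vLLM /metrics endpoint through the proxy
response = requests.get(
    f"{LITELLM_PROXY_BASE_URL}/vllm/metrics",
    headers={"Authorization": f"Bearer {LITELLM_VIRTUAL_KEY}"},
)
print(response.text)  # Prometheus-style plain-text metrics
```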
|
||||
|
||||
|
||||
### **Example 1: Metrics endpoint**
|
||||
|
||||
#### LiteLLM Proxy Call
|
||||
|
||||
```bash
|
||||
curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY'
|
||||
```
|
||||
|
||||
|
||||
#### Direct VLLM API Call
|
||||
|
||||
```bash
|
||||
curl -L -X GET 'https://my-vllm-server.com/metrics' \
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
### **Example 2: Chat API**
|
||||
|
||||
#### LiteLLM Proxy Call
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
|
||||
-d '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am going to Paris, what should I see?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.1,
|
||||
"model": "qwen2.5-7b-instruct",
|
||||
}'
|
||||
```
|
||||
|
||||
#### Direct VLLM API Call
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'https://my-vllm-server.com/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am going to Paris, what should I see?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.1,
|
||||
"model": "qwen2.5-7b-instruct",
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
## Advanced - Use with Virtual Keys
|
||||
|
||||
Pre-requisites
|
||||
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
|
||||
|
||||
Use this to avoid giving developers the raw VLLM API base/key, while still letting them use VLLM endpoints.
|
||||
|
||||
### Usage
|
||||
|
||||
1. Setup environment
|
||||
|
||||
```bash
|
||||
export DATABASE_URL=""
|
||||
export LITELLM_MASTER_KEY=""
|
||||
export HOSTED_VLLM_API_BASE=""
|
||||
```
|
||||
|
||||
```bash
|
||||
litellm
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
2. Generate virtual key
|
||||
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/key/generate' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{}'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```bash
|
||||
{
|
||||
...
|
||||
"key": "sk-1234ewknldferwedojwojw"
|
||||
}
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
|
||||
--data '{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I am going to Paris, what should I see?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.1,
|
||||
"model": "qwen2.5-7b-instruct",
|
||||
}'
|
||||
```
|
|
@ -1095,7 +1095,7 @@ response = completion(
|
|||
print(response.choices[0])
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" lable="PROXY">
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Add model to config
|
||||
|
||||
|
|
|
@ -478,12 +478,12 @@ response.stream_to_file(speech_file_path)
|
|||
## **Authentication**
|
||||
|
||||
|
||||
### Entrata ID - use `azure_ad_token`
|
||||
### Entra ID - use `azure_ad_token`
|
||||
|
||||
This is a walkthrough on how to use Azure Active Directory Tokens - Microsoft Entra ID to make `litellm.completion()` calls
|
||||
|
||||
Step 1 - Download Azure CLI
|
||||
Installation instructons: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli
|
||||
Installation instructions: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli
|
||||
```shell
|
||||
brew update && brew install azure-cli
|
||||
```
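
A minimal end-to-end sketch of this flow is shown below, assuming `az login` has already been run; the deployment name, endpoint, and API version are placeholders.

```python
import json
import subprocess

from litellm import completion

# Fetch an Entra ID access token for Azure OpenAI via the Azure CLI (assumes `az login` was run)
token_json = subprocess.run(
    ["az", "account", "get-access-token", "--resource", "https://cognitiveservices.azure.com"],
    capture_output=True,
    check=True,
    text=True,
).stdout
azure_ad_token = json.loads(token_json)["accessToken"]

# Placeholder deployment name, endpoint, and API version
response = completion(
    model="azure/my-gpt-4o-deployment",
    messages=[{"role": "user", "content": "Hello"}],
    azure_ad_token=azure_ad_token,
    api_base="https://my-endpoint.openai.azure.com/",
    api_version="2024-02-15-preview",
)
print(response)
```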
|
||||
|
@ -545,7 +545,7 @@ model_list:
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Entrata ID - use tenant_id, client_id, client_secret
|
||||
### Entra ID - use tenant_id, client_id, client_secret
|
||||
|
||||
Here is an example of setting up `tenant_id`, `client_id`, `client_secret` in your litellm proxy `config.yaml`
|
||||
```yaml
|
||||
|
@ -581,7 +581,7 @@ Example video of using `tenant_id`, `client_id`, `client_secret` with LiteLLM Pr
|
|||
|
||||
<iframe width="840" height="500" src="https://www.loom.com/embed/70d3f219ee7f4e5d84778b7f17bba506?sid=04b8ff29-485f-4cb8-929e-6b392722f36d" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
|
||||
|
||||
### Entrata ID - use client_id, username, password
|
||||
### Entra ID - use client_id, username, password
|
||||
|
||||
Here is an example of setting up `client_id`, `azure_username`, `azure_password` in your litellm proxy `config.yaml`
|
||||
```yaml
|
||||
|
@ -1002,8 +1002,125 @@ Expected Response:
|
|||
```
|
||||
|
||||
|
||||
## **Azure Responses API**
|
||||
|
||||
| Property | Details |
|
||||
|-------|-------|
|
||||
| Description | Azure OpenAI Responses API |
|
||||
| `custom_llm_provider` on LiteLLM | `azure/` |
|
||||
| Supported Operations | `/v1/responses`|
|
||||
| Azure OpenAI Responses API | [Azure OpenAI Responses API ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/responses?tabs=python-secure) |
|
||||
| Cost Tracking, Logging Support | ✅ LiteLLM will log, track cost for Responses API Requests |
|
||||
| Supported OpenAI Params | ✅ All OpenAI params are supported, [See here](https://github.com/BerriAI/litellm/blob/0717369ae6969882d149933da48eeb8ab0e691bd/litellm/llms/openai/responses/transformation.py#L23) |
|
||||
|
||||
## Usage
|
||||
|
||||
## Create a model response
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="litellm-sdk" label="LiteLLM SDK">
|
||||
|
||||
#### Non-streaming
|
||||
|
||||
```python showLineNumbers title="Azure Responses API"
|
||||
import os
import litellm
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="azure/o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100,
|
||||
api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
|
||||
api_base="https://litellm8397336933.openai.azure.com/",
|
||||
api_version="2023-03-15-preview",
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="Azure Responses API"
|
||||
import os
import litellm
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="azure/o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True,
|
||||
api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
|
||||
api_base="https://litellm8397336933.openai.azure.com/",
|
||||
api_version="2023-03-15-preview",
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml showLineNumbers title="Azure Responses API"
|
||||
model_list:
|
||||
- model_name: o1-pro
|
||||
litellm_params:
|
||||
model: azure/o1-pro
|
||||
api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY
|
||||
api_base: https://litellm8397336933.openai.azure.com/
|
||||
api_version: 2023-03-15-preview
|
||||
```
|
||||
|
||||
Start your LiteLLM proxy:
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
Then use the OpenAI SDK pointed to your proxy:
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
@ -1076,32 +1193,24 @@ print(response)
|
|||
```
|
||||
|
||||
|
||||
### Parallel Function calling
|
||||
### Tool Calling / Function Calling
|
||||
|
||||
See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
# set Azure env variables
|
||||
import os
|
||||
import litellm
|
||||
import json
|
||||
|
||||
os.environ['AZURE_API_KEY'] = "" # litellm reads AZURE_API_KEY from .env and sends the request
|
||||
os.environ['AZURE_API_BASE'] = "https://openai-gpt-4-test-v-1.openai.azure.com/"
|
||||
os.environ['AZURE_API_VERSION'] = "2023-07-01-preview"
|
||||
|
||||
import litellm
|
||||
import json
|
||||
# Example dummy function hard coded to return the same weather
|
||||
# In production, this could be your backend API or an external API
|
||||
def get_current_weather(location, unit="fahrenheit"):
|
||||
"""Get the current weather in a given location"""
|
||||
if "tokyo" in location.lower():
|
||||
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
|
||||
elif "san francisco" in location.lower():
|
||||
return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"})
|
||||
elif "paris" in location.lower():
|
||||
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
|
||||
else:
|
||||
return json.dumps({"location": location, "temperature": "unknown"})
|
||||
|
||||
## Step 1: send the conversation and available functions to the model
|
||||
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
|
@ -1125,7 +1234,7 @@ tools = [
|
|||
|
||||
response = litellm.completion(
|
||||
model="azure/chatgpt-functioncalling", # model = azure/<your-azure-deployment-name>
|
||||
messages=messages,
|
||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}],
|
||||
tools=tools,
|
||||
tool_choice="auto", # auto is default, but we'll be explicit
|
||||
)
|
||||
|
@ -1134,8 +1243,49 @@ response_message = response.choices[0].message
|
|||
tool_calls = response.choices[0].message.tool_calls
|
||||
print("\nTool Choice:\n", tool_calls)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-gpt-3.5
|
||||
litellm_params:
|
||||
model: azure/chatgpt-functioncalling
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://localhost:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "azure-gpt-3.5",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hey, how'\''s it going? Thinking long and hard before replying - what is the meaning of the world and life itself"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
### Spend Tracking for Azure OpenAI Models (PROXY)
|
||||
|
||||
Set the base model for cost tracking on Azure image generation calls
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 🆕 Databricks
|
||||
# Databricks
|
||||
|
||||
LiteLLM supports all models on Databricks
|
||||
|
||||
|
@ -154,7 +154,205 @@ response = completion(
|
|||
temperature: 0.5
|
||||
```
|
||||
|
||||
## Passings Databricks specific params - 'instruction'
|
||||
|
||||
## Usage - Thinking / `reasoning_content`
|
||||
|
||||
LiteLLM translates OpenAI's `reasoning_effort` to Anthropic's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/23051d89dd3611a81617d84277059cd88b2df511/litellm/llms/anthropic/chat/transformation.py#L298)
|
||||
|
||||
| reasoning_effort | thinking |
|
||||
| ---------------- | -------- |
|
||||
| "low" | "budget_tokens": 1024 |
|
||||
| "medium" | "budget_tokens": 2048 |
|
||||
| "high" | "budget_tokens": 4096 |
|
||||
|
||||
|
||||
Known Limitations:
|
||||
- Support for passing thinking blocks back to Claude [Issue](https://github.com/BerriAI/litellm/issues/9790)
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
# set ENV variables (can also be passed in to .completion() - e.g. `api_base`, `api_key`)
|
||||
os.environ["DATABRICKS_API_KEY"] = "databricks key"
|
||||
os.environ["DATABRICKS_API_BASE"] = "databricks base url"
|
||||
|
||||
resp = completion(
|
||||
model="databricks/databricks-claude-3-7-sonnet",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
reasoning_effort="low",
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
- model_name: claude-3-7-sonnet
|
||||
litellm_params:
|
||||
model: databricks/databricks-claude-3-7-sonnet
|
||||
api_key: os.environ/DATABRICKS_API_KEY
|
||||
api_base: os.environ/DATABRICKS_API_BASE
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
|
||||
-d '{
|
||||
"model": "claude-3-7-sonnet",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"reasoning_effort": "low"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```python
|
||||
ModelResponse(
|
||||
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
|
||||
created=1740470510,
|
||||
model='claude-3-7-sonnet-20250219',
|
||||
object='chat.completion',
|
||||
system_fingerprint=None,
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason='stop',
|
||||
index=0,
|
||||
message=Message(
|
||||
content="The capital of France is Paris.",
|
||||
role='assistant',
|
||||
tool_calls=None,
|
||||
function_call=None,
|
||||
provider_specific_fields={
|
||||
'citations': None,
|
||||
'thinking_blocks': [
|
||||
{
|
||||
'type': 'thinking',
|
||||
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
|
||||
'signature': 'EuYBCkQYAiJAy6...'
|
||||
}
|
||||
]
|
||||
}
|
||||
),
|
||||
thinking_blocks=[
|
||||
{
|
||||
'type': 'thinking',
|
||||
'thinking': 'The capital of France is Paris. This is a very straightforward factual question.',
|
||||
'signature': 'EuYBCkQYAiJAy6AGB...'
|
||||
}
|
||||
],
|
||||
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
|
||||
)
|
||||
],
|
||||
usage=Usage(
|
||||
completion_tokens=68,
|
||||
prompt_tokens=42,
|
||||
total_tokens=110,
|
||||
completion_tokens_details=None,
|
||||
prompt_tokens_details=PromptTokensDetailsWrapper(
|
||||
audio_tokens=None,
|
||||
cached_tokens=0,
|
||||
text_tokens=None,
|
||||
image_tokens=None
|
||||
),
|
||||
cache_creation_input_tokens=0,
|
||||
cache_read_input_tokens=0
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Pass `thinking` to Anthropic models
|
||||
|
||||
You can also pass the `thinking` parameter to Anthropic models.
|
||||
|
||||
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
# set ENV variables (can also be passed in to .completion() - e.g. `api_base`, `api_key`)
|
||||
os.environ["DATABRICKS_API_KEY"] = "databricks key"
|
||||
os.environ["DATABRICKS_API_BASE"] = "databricks base url"
|
||||
|
||||
response = completion(
|
||||
model="databricks/databricks-claude-3-7-sonnet",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||
-d '{
|
||||
"model": "databricks/databricks-claude-3-7-sonnet",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Supported Databricks Chat Completion Models
|
||||
|
||||
:::tip
|
||||
|
||||
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
|
||||
|
||||
:::
|
||||
|
||||
|
||||
| Model Name | Command |
|
||||
|----------------------------|------------------------------------------------------------------|
|
||||
| databricks/databricks-claude-3-7-sonnet | `completion(model='databricks/databricks-claude-3-7-sonnet', messages=messages)` |
|
||||
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
|
||||
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
|
||||
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
|
||||
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
|
||||
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
|
||||
| databricks-mixtral-8x7b-instruct | `completion(model='databricks/databricks-mixtral-8x7b-instruct', messages=messages)` |
|
||||
| databricks-mpt-30b-instruct | `completion(model='databricks/databricks-mpt-30b-instruct', messages=messages)` |
|
||||
| databricks-mpt-7b-instruct | `completion(model='databricks/databricks-mpt-7b-instruct', messages=messages)` |
|
||||
|
||||
|
||||
## Embedding Models
|
||||
|
||||
### Passing Databricks specific params - 'instruction'
|
||||
|
||||
For embedding models, databricks lets you pass in an additional param 'instruction'. [Full Spec](https://github.com/BerriAI/litellm/blob/43353c28b341df0d9992b45c6ce464222ebd7984/litellm/llms/databricks.py#L164)
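
A minimal SDK sketch is below; the endpoint name `databricks-bge-large-en` is an assumption, so substitute whichever embedding endpoint is served in your workspace.

```python
import os
from litellm import embedding

os.environ["DATABRICKS_API_KEY"] = "databricks key"
os.environ["DATABRICKS_API_BASE"] = "databricks base url"

# 'instruction' is forwarded to the Databricks embedding endpoint
response = embedding(
    model="databricks/databricks-bge-large-en",  # assumed endpoint name
    input=["good morning from litellm"],
    instruction="Represent this sentence for searching relevant passages:",
)
print(response)
```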
|
||||
|
||||
|
@ -187,27 +385,6 @@ response = litellm.embedding(
|
|||
instruction: "Represent this sentence for searching relevant passages:"
|
||||
```
|
||||
|
||||
|
||||
## Supported Databricks Chat Completion Models
|
||||
|
||||
:::tip
|
||||
|
||||
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
|
||||
|
||||
:::
|
||||
|
||||
|
||||
| Model Name | Command |
|
||||
|----------------------------|------------------------------------------------------------------|
|
||||
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
|
||||
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
|
||||
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
|
||||
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
|
||||
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
|
||||
| databricks-mixtral-8x7b-instruct | `completion(model='databricks/databricks-mixtral-8x7b-instruct', messages=messages)` |
|
||||
| databricks-mpt-30b-instruct | `completion(model='databricks/databricks-mpt-30b-instruct', messages=messages)` |
|
||||
| databricks-mpt-7b-instruct | `completion(model='databricks/databricks-mpt-7b-instruct', messages=messages)` |
|
||||
|
||||
## Supported Databricks Embedding Models
|
||||
|
||||
:::tip
|
||||
|
|
|
@ -39,14 +39,164 @@ response = completion(
|
|||
- temperature
|
||||
- top_p
|
||||
- max_tokens
|
||||
- max_completion_tokens
|
||||
- stream
|
||||
- tools
|
||||
- tool_choice
|
||||
- functions
|
||||
- response_format
|
||||
- n
|
||||
- stop
|
||||
- logprobs
|
||||
- frequency_penalty
|
||||
- modalities
|
||||
- reasoning_content
|
||||
|
||||
**Anthropic Params**
|
||||
- thinking (used to set max budget tokens across anthropic/gemini models)
|
||||
|
||||
[**See Updated List**](https://github.com/BerriAI/litellm/blob/main/litellm/llms/gemini/chat/transformation.py#L70)
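
For example, here is a short sketch combining a few of the params above with a Gemini model used elsewhere in this doc:

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = ".."

# Pass standard OpenAI params - LiteLLM translates them for Gemini
response = completion(
    model="gemini/gemini-2.0-flash",
    messages=[{"role": "user", "content": "Write a one-line haiku about the sea."}],
    temperature=0.7,
    max_tokens=100,
    top_p=0.9,
    stop=["\n\n"],
)
print(response.choices[0].message.content)
```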
|
||||
|
||||
|
||||
|
||||
## Usage - Thinking / `reasoning_content`
|
||||
|
||||
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
|
||||
|
||||
**Mapping**
|
||||
|
||||
| reasoning_effort | thinking |
|
||||
| ---------------- | -------- |
|
||||
| "low" | "budget_tokens": 1024 |
|
||||
| "medium" | "budget_tokens": 2048 |
|
||||
| "high" | "budget_tokens": 4096 |
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
resp = completion(
|
||||
model="gemini/gemini-2.5-flash-preview-04-17",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
reasoning_effort="low",
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
- model_name: gemini-2.5-flash
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.5-flash-preview-04-17
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
|
||||
-d '{
|
||||
"model": "gemini-2.5-flash",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"reasoning_effort": "low"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```python
|
||||
ModelResponse(
|
||||
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
|
||||
created=1740470510,
|
||||
model='claude-3-7-sonnet-20250219',
|
||||
object='chat.completion',
|
||||
system_fingerprint=None,
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason='stop',
|
||||
index=0,
|
||||
message=Message(
|
||||
content="The capital of France is Paris.",
|
||||
role='assistant',
|
||||
tool_calls=None,
|
||||
function_call=None,
|
||||
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
|
||||
),
|
||||
)
|
||||
],
|
||||
usage=Usage(
|
||||
completion_tokens=68,
|
||||
prompt_tokens=42,
|
||||
total_tokens=110,
|
||||
completion_tokens_details=None,
|
||||
prompt_tokens_details=PromptTokensDetailsWrapper(
|
||||
audio_tokens=None,
|
||||
cached_tokens=0,
|
||||
text_tokens=None,
|
||||
image_tokens=None
|
||||
),
|
||||
cache_creation_input_tokens=0,
|
||||
cache_read_input_tokens=0
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Pass `thinking` to Gemini models
|
||||
|
||||
You can also pass the `thinking` parameter to Gemini models.
|
||||
|
||||
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import litellm

response = litellm.completion(
|
||||
model="gemini/gemini-2.5-flash-preview-04-17",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||
-d '{
|
||||
"model": "gemini/gemini-2.5-flash-preview-04-17",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
||||
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
|
||||
|
||||
## Passing Gemini Specific Params
|
||||
### Response schema
|
||||
|
@ -438,6 +588,179 @@ assert isinstance(
|
|||
```
|
||||
|
||||
|
||||
### Google Search Tool
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ["GEMINI_API_KEY"] = ".."
|
||||
|
||||
tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH
|
||||
|
||||
response = completion(
|
||||
model="gemini/gemini-2.0-flash",
|
||||
messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gemini-2.0-flash
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.0-flash
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start Proxy
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Make Request!
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gemini-2.0-flash",
|
||||
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
|
||||
"tools": [{"googleSearch": {}}]
|
||||
}
|
||||
'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Google Search Retrieval
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ["GEMINI_API_KEY"] = ".."
|
||||
|
||||
tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH
|
||||
|
||||
response = completion(
|
||||
model="gemini/gemini-2.0-flash",
|
||||
messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gemini-2.0-flash
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.0-flash
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start Proxy
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Make Request!
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gemini-2.0-flash",
|
||||
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
|
||||
"tools": [{"googleSearch": {}}]
|
||||
}
|
||||
'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Code Execution Tool
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ["GEMINI_API_KEY"] = ".."
|
||||
|
||||
tools = [{"codeExecution": {}}] # 👈 ADD GOOGLE SEARCH
|
||||
|
||||
response = completion(
|
||||
model="gemini/gemini-2.0-flash",
|
||||
messages=[{"role": "user", "content": "What is the weather in San Francisco?"}],
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gemini-2.0-flash
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.0-flash
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start Proxy
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Make Request!
|
||||
```bash
|
||||
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gemini-2.0-flash",
|
||||
"messages": [{"role": "user", "content": "What is the weather in San Francisco?"}],
|
||||
"tools": [{"codeExecution": {}}]
|
||||
}
|
||||
'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## JSON Mode
|
||||
|
||||
<Tabs>
|
||||
|
@ -887,3 +1210,54 @@ response = await client.chat.completions.create(
|
|||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Image Generation
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
response = completion(
|
||||
model="gemini/gemini-2.0-flash-exp-image-generation",
|
||||
messages=[{"role": "user", "content": "Generate an image of a cat"}],
|
||||
modalities=["image", "text"],
|
||||
)
|
||||
assert response.choices[0].message.content is not None # "data:image/png;base64,e4rr.."
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gemini-2.0-flash-exp-image-generation
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.0-flash-exp-image-generation
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://localhost:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gemini-2.0-flash-exp-image-generation",
|
||||
"messages": [{"role": "user", "content": "Generate an image of a cat"}],
|
||||
"modalities": ["image", "text"]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
|
161
docs/my-website/docs/providers/google_ai_studio/files.md
Normal file
|
@ -0,0 +1,161 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [BETA] Google AI Studio (Gemini) Files API
|
||||
|
||||
Use this to upload files to Google AI Studio (Gemini).
|
||||
|
||||
Useful to pass in large media files to Gemini's `/generateContent` endpoint.
|
||||
|
||||
| Action | Supported |
|
||||
|----------|-----------|
|
||||
| `create` | Yes |
|
||||
| `delete` | No |
|
||||
| `retrieve` | No |
|
||||
| `list` | No |
|
||||
|
||||
## Usage
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import base64
|
||||
import requests
|
||||
from litellm import completion, create_file
|
||||
import os
|
||||
|
||||
|
||||
### UPLOAD FILE ###
|
||||
|
||||
# Fetch the audio file and convert it to a base64 encoded string
|
||||
url = "https://cdn.openai.com/API/docs/audio/alloy.wav"
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
wav_data = response.content
|
||||
encoded_string = base64.b64encode(wav_data).decode('utf-8')
|
||||
|
||||
|
||||
file = create_file(
|
||||
file=wav_data,
|
||||
purpose="user_data",
|
||||
extra_body={"custom_llm_provider": "gemini"},
|
||||
api_key=os.getenv("GEMINI_API_KEY"),
|
||||
)
|
||||
|
||||
print(f"file: {file}")
|
||||
|
||||
assert file is not None
|
||||
|
||||
|
||||
### GENERATE CONTENT ###
|
||||
response = completion(
|
||||
model="gemini-2.0-flash",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is in this recording?"
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file.id,
|
||||
"filename": "my-test-name",
|
||||
"format": "audio/wav"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
print(response.choices[0].message)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: "gemini-2.0-flash"
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.0-flash
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it
|
||||
|
||||
```python
|
||||
import base64
|
||||
import requests
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://0.0.0.0:4000",
|
||||
api_key="sk-1234"
|
||||
)
|
||||
|
||||
# Fetch the audio file and convert it to a base64 encoded string
|
||||
url = "https://cdn.openai.com/API/docs/audio/alloy.wav"
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
wav_data = response.content
|
||||
encoded_string = base64.b64encode(wav_data).decode('utf-8')
|
||||
|
||||
|
||||
file = client.files.create(
|
||||
file=wav_data,
|
||||
purpose="user_data",
|
||||
extra_body={"target_model_names": "gemini-2.0-flash"}
|
||||
)
|
||||
|
||||
print(f"file: {file}")
|
||||
|
||||
assert file is not None
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="gemini-2.0-flash",
|
||||
modalities=["text", "audio"],
|
||||
audio={"voice": "alloy", "format": "wav"},
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is in this recording?"
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file.id,
|
||||
"filename": "my-test-name",
|
||||
"format": "audio/wav"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
],
|
||||
extra_body={"drop_params": True}
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
|
@ -2,466 +2,392 @@ import Image from '@theme/IdealImage';
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Huggingface
|
||||
# Hugging Face
|
||||
LiteLLM supports running inference across multiple services for models hosted on the Hugging Face Hub.
|
||||
|
||||
LiteLLM supports the following types of Hugging Face models:
|
||||
- **Serverless Inference Providers** - Hugging Face offers an easy and unified access to serverless AI inference through multiple inference providers, like [Together AI](https://together.ai) and [Sambanova](https://sambanova.ai). This is the fastest way to integrate AI in your products with a maintenance-free and scalable solution. More details in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index).
|
||||
- **Dedicated Inference Endpoints** - a product to easily deploy models to production. Inference is run by Hugging Face in dedicated, fully managed infrastructure on a cloud provider of your choice. You can deploy your model on Hugging Face Inference Endpoints by following [these steps](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint).
|
||||
|
||||
- Serverless Inference API (free) - loaded and ready to use: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation
|
||||
- Dedicated Inference Endpoints (paid) - manual deployment: https://ui.endpoints.huggingface.co/
|
||||
- All LLMs served via Hugging Face's Inference use [Text-generation-inference](https://huggingface.co/docs/text-generation-inference).
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Serverless Inference Providers
|
||||
You can check available models for an inference provider by going to [huggingface.co/models](https://huggingface.co/models), clicking the "Other" filter tab, and selecting your desired provider:
|
||||
|
||||

|
||||
|
||||
For example, you can find all Fireworks supported models [here](https://huggingface.co/models?inference_provider=fireworks-ai&sort=trending).
|
||||
|
||||
|
||||
### Dedicated Inference Endpoints
|
||||
Refer to the [Inference Endpoints catalog](https://endpoints.huggingface.co/catalog) for a list of available models.
|
||||
|
||||
## Usage
|
||||
|
||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
You need to tell LiteLLM when you're calling Huggingface.
|
||||
This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/<model_name>",...)`.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="serverless" label="Serverless Inference API">
|
||||
<TabItem value="serverless" label="Serverless Inference Providers">
|
||||
|
||||
By default, LiteLLM will assume a Hugging Face call follows the [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api), which is fully compatible with the OpenAI Chat Completion API.
|
||||
### Authentication
|
||||
With a single Hugging Face token, you can access inference through multiple providers. Your calls are routed through Hugging Face and the usage is billed directly to your Hugging Face account at the standard provider API rates.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
# [OPTIONAL] set env var
|
||||
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
||||
|
||||
messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}]
|
||||
|
||||
# e.g. Call 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct' from Serverless Inference API
|
||||
response = completion(
|
||||
model="huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
stream=True
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Add models to your config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: llama-3.1-8B-instruct
|
||||
litellm_params:
|
||||
model: huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct
|
||||
api_key: os.environ/HUGGINGFACE_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
Simply set the `HF_TOKEN` environment variable with your Hugging Face token; you can create one here: https://huggingface.co/settings/tokens.
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml --debug
|
||||
export HF_TOKEN="hf_xxxxxx"
|
||||
```
|
||||
or alternatively, you can pass your Hugging Face token as a parameter:
|
||||
```python
|
||||
completion(..., api_key="hf_xxxxxx")
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
### Getting Started
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "llama-3.1-8B-instruct",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I like you!"
|
||||
}
|
||||
],
|
||||
}'
|
||||
To use a Hugging Face model, specify both the provider and model you want to use in the following format:
|
||||
```
|
||||
huggingface/<provider>/<hf_org_or_user>/<hf_model>
|
||||
```
|
||||
Where `<hf_org_or_user>/<hf_model>` is the Hugging Face model ID and `<provider>` is the inference provider.
|
||||
By default, if you don't specify a provider, LiteLLM will use the [HF Inference API](https://huggingface.co/docs/api-inference/en/index).
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
</TabItem>
|
||||
<TabItem value="classification" label="Text Classification">
|
||||
|
||||
Append `text-classification` to the model name
|
||||
|
||||
e.g. `huggingface/text-classification/<model-name>`
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
Examples:
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
# Run DeepSeek-R1 inference through Together AI
|
||||
completion(model="huggingface/together/deepseek-ai/DeepSeek-R1",...)
|
||||
|
||||
# [OPTIONAL] set env var
|
||||
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
||||
# Run Qwen2.5-72B-Instruct inference through Sambanova
|
||||
completion(model="huggingface/sambanova/Qwen/Qwen2.5-72B-Instruct",...)
|
||||
|
||||
messages = [{ "content": "I like you, I love you!","role": "user"}]
|
||||
|
||||
# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints
|
||||
response = completion(
|
||||
model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier",
|
||||
messages=messages,
|
||||
api_base="https://my-endpoint.endpoints.huggingface.cloud",
|
||||
)
|
||||
|
||||
print(response)
|
||||
# Run Llama-3.3-70B-Instruct inference through HF Inference API
|
||||
completion(model="huggingface/meta-llama/Llama-3.3-70B-Instruct",...)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Add models to your config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: bert-classifier
|
||||
litellm_params:
|
||||
model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier
|
||||
api_key: os.environ/HUGGINGFACE_API_KEY
|
||||
api_base: "https://my-endpoint.endpoints.huggingface.cloud"
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml --debug
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "bert-classifier",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I like you!"
|
||||
}
|
||||
],
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
</TabItem>
|
||||
<TabItem value="dedicated" label="Dedicated Inference Endpoints">
|
||||
|
||||
Steps to use
|
||||
* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/
|
||||
* Set `api_base` to your deployed api base
|
||||
* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HUGGINGFACE_API_KEY"] = ""
|
||||
|
||||
# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b
|
||||
# add the 'huggingface/' prefix to the model to set huggingface as the provider
|
||||
# set api base to your deployed api endpoint from hugging face
|
||||
response = completion(
|
||||
model="huggingface/glaiveai/glaive-coder-7b",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Add models to your config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: glaive-coder
|
||||
litellm_params:
|
||||
model: huggingface/glaiveai/glaive-coder-7b
|
||||
api_key: os.environ/HUGGINGFACE_API_KEY
|
||||
api_base: "https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud"
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml --debug
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "glaive-coder",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "I like you!"
|
||||
}
|
||||
],
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Streaming
|
||||
|
||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
You need to tell LiteLLM when you're calling Huggingface.
|
||||
This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/<model_name>",...)`.
|
||||
### Basic Completion
|
||||
Here's an example of chat completion using the DeepSeek-R1 model through Together AI:
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"

# Call DeepSeek-R1, served through the Together AI inference provider
response = completion(
    model="huggingface/together/deepseek-ai/DeepSeek-R1",
    messages=[
        {
            "role": "user",
            "content": "How many r's are in the word 'strawberry'?",
        }
    ],
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
### Streaming
|
||||
Now, let's see what a streaming request looks like.
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
response = completion(
|
||||
model="huggingface/together/deepseek-ai/DeepSeek-R1",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "How many r's are in the word `strawberry`?",
|
||||
|
||||
}
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
print(response)
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
### Image Input
|
||||
You can also pass images when the model supports it. Here is an example using the [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) model through Sambanova.
|
||||
|
||||
```python
|
||||
import os
from litellm import completion
|
||||
|
||||
# Set your Hugging Face Token
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
}
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = completion(
|
||||
model="huggingface/sambanova/meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
messages=messages,
|
||||
)
|
||||
print(response.choices[0])
|
||||
```
|
||||
|
||||
### Function Calling
|
||||
You can extend the model's capabilities by giving it access to tools. Here is an example with function calling using the [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model through Sambanova.
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
# Set your Hugging Face Token
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location"],
|
||||
},
|
||||
}
|
||||
}
|
||||
]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in Boston today?",
|
||||
}
|
||||
]
|
||||
|
||||
response = completion(
|
||||
model="huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct",
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="auto"
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="endpoints" label="Inference Endpoints">
|
||||
|
||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/LiteLLM_HuggingFace.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
### Basic Completion
|
||||
After you have [deployed your Hugging Face Inference Endpoint](https://endpoints.huggingface.co/new) on dedicated infrastructure, you can run inference on it by providing the endpoint base URL in `api_base`, and indicating `huggingface/tgi` as the model name.
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
response = completion(
|
||||
model="huggingface/tgi",
|
||||
messages=[{"content": "Hello, how are you?", "role": "user"}],
|
||||
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/"
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
### Streaming
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
response = completion(
|
||||
model="huggingface/tgi",
|
||||
messages=[{"content": "Hello, how are you?", "role": "user"}],
|
||||
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
### Image Input
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
}
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
response = completion(
|
||||
model="huggingface/tgi",
|
||||
messages=messages,
|
||||
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/"
|
||||
)
|
||||
print(response.choices[0])
|
||||
```
|
||||
|
||||
### Function Calling
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_xxxxxx"
|
||||
|
||||
functions = [{
|
||||
"name": "get_weather",
|
||||
"description": "Get the weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The location to get weather for"
|
||||
}
|
||||
},
|
||||
"required": ["location"]
|
||||
}
|
||||
}]
|
||||
|
||||
response = completion(
|
||||
model="huggingface/tgi",
|
||||
messages=[{"content": "What's the weather like in San Francisco?", "role": "user"}],
|
||||
api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
|
||||
functions=functions
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## LiteLLM Proxy Server with Hugging Face models
|
||||
You can set up a [LiteLLM Proxy Server](https://docs.litellm.ai/#litellm-proxy-server-llm-gateway) to serve Hugging Face models through any of the supported Inference Providers. Here's how to do it:
|
||||
|
||||
### Step 1. Setup the config file
|
||||
|
||||
In this case, we are configuring a proxy to serve `DeepSeek R1` from Hugging Face, using Together AI as the backend Inference Provider.
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: my-r1-model
|
||||
litellm_params:
|
||||
model: huggingface/together/deepseek-ai/DeepSeek-R1
|
||||
api_key: os.environ/HF_TOKEN # ensure you have `HF_TOKEN` in your .env
|
||||
```
|
||||
|
||||
### Step 2. Start the server
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
### Step 3. Make a request to the server
|
||||
<Tabs>
|
||||
<TabItem value="curl" label="curl">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "my-r1-model",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, how are you?"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="python" label="python">
|
||||
|
||||
```python
|
||||
# pip install openai
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://0.0.0.0:4000",
|
||||
api_key="anything",
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="my-r1-model",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello, how are you?"}
|
||||
]
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Embedding
|
||||
|
||||
LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) format.
|
||||
LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) models as well.
|
||||
|
||||
```python
|
||||
from litellm import embedding
|
||||
import os
|
||||
os.environ['HUGGINGFACE_API_KEY'] = ""
|
||||
os.environ['HF_TOKEN'] = "hf_xxxxxx"
|
||||
response = embedding(
|
||||
model='huggingface/microsoft/codebert-base',
|
||||
input=["good morning from litellm"]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced
|
||||
|
||||
### Setting API KEYS + API BASE
|
||||
|
||||
If required, you can set the API key and API base in your OS environment. [Code for how it's sent](https://github.com/BerriAI/litellm/blob/0100ab2382a0e720c7978fbf662cc6e6920e7e03/litellm/llms/huggingface_restapi.py#L25)
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["HUGGINGFACE_API_KEY"] = ""
|
||||
os.environ["HUGGINGFACE_API_BASE"] = ""
|
||||
```
|
||||
|
||||
### Viewing Log probs
|
||||
|
||||
#### Using `decoder_input_details` - OpenAI `echo`
|
||||
|
||||
The `echo` param is supported by OpenAI Completions - Use `litellm.text_completion()` for this
|
||||
|
||||
```python
|
||||
from litellm import text_completion
|
||||
response = text_completion(
|
||||
model="huggingface/bigcode/starcoder",
|
||||
prompt="good morning",
|
||||
max_tokens=10, logprobs=10,
|
||||
echo=True
|
||||
)
|
||||
```
|
||||
|
||||
#### Output
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "chatcmpl-3fc71792-c442-4ba1-a611-19dd0ac371ad",
|
||||
"object": "text_completion",
|
||||
"created": 1698801125.936519,
|
||||
"model": "bigcode/starcoder",
|
||||
"choices": [
|
||||
{
|
||||
"text": ", I'm going to make you a sand",
|
||||
"index": 0,
|
||||
"logprobs": {
|
||||
"tokens": [
|
||||
"good",
|
||||
" morning",
|
||||
",",
|
||||
" I",
|
||||
"'m",
|
||||
" going",
|
||||
" to",
|
||||
" make",
|
||||
" you",
|
||||
" a",
|
||||
" s",
|
||||
"and"
|
||||
],
|
||||
"token_logprobs": [
|
||||
"None",
|
||||
-14.96875,
|
||||
-2.2285156,
|
||||
-2.734375,
|
||||
-2.0957031,
|
||||
-2.0917969,
|
||||
-0.09429932,
|
||||
-3.1132812,
|
||||
-1.3203125,
|
||||
-1.2304688,
|
||||
-1.6201172,
|
||||
-0.010292053
|
||||
]
|
||||
},
|
||||
"finish_reason": "length"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"completion_tokens": 9,
|
||||
"prompt_tokens": 2,
|
||||
"total_tokens": 11
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Models with Prompt Formatting
|
||||
|
||||
For models with special prompt templates (e.g. Llama2), we format the prompt to fit their template.
|
||||
|
||||
#### Models with natively Supported Prompt Templates
|
||||
|
||||
| Model Name | Works for Models | Function Call | Required OS Variables |
|
||||
| ------------------------------------ | ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
|
||||
| mistralai/Mistral-7B-Instruct-v0.1 | mistralai/Mistral-7B-Instruct-v0.1 | `completion(model='huggingface/mistralai/Mistral-7B-Instruct-v0.1', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
| meta-llama/Llama-2-7b-chat | All meta-llama llama2 chat models | `completion(model='huggingface/meta-llama/Llama-2-7b', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
| tiiuae/falcon-7b-instruct | All falcon instruct models | `completion(model='huggingface/tiiuae/falcon-7b-instruct', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
| mosaicml/mpt-7b-chat | All mpt chat models | `completion(model='huggingface/mosaicml/mpt-7b-chat', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
| codellama/CodeLlama-34b-Instruct-hf | All codellama instruct models | `completion(model='huggingface/codellama/CodeLlama-34b-Instruct-hf', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
| WizardLM/WizardCoder-Python-34B-V1.0 | All wizardcoder models | `completion(model='huggingface/WizardLM/WizardCoder-Python-34B-V1.0', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
| Phind/Phind-CodeLlama-34B-v2 | All phind-codellama models | `completion(model='huggingface/Phind/Phind-CodeLlama-34B-v2', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` |
|
||||
|
||||
**What if we don't support a model you need?**
|
||||
You can also specify your own custom prompt formatting, in case we don't have your model covered yet.
|
||||
|
||||
**Does this mean you have to specify a prompt for all models?**
|
||||
No. By default we'll concatenate your message content to make a prompt.
|
||||
|
||||
**Default Prompt Template**
|
||||
|
||||
```python
|
||||
def default_pt(messages):
|
||||
return " ".join(message["content"] for message in messages)
|
||||
```
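For example, with this default template a short conversation collapses into a single space-joined string:

```python
messages = [
    {"role": "user", "content": "Hey, how's it going?"},
    {"role": "assistant", "content": "All good! How can I help?"},
    {"role": "user", "content": "Write me a haiku about llamas."},
]

print(default_pt(messages))
# "Hey, how's it going? All good! How can I help? Write me a haiku about llamas."
```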
|
||||
|
||||
[Code for how prompt formats work in LiteLLM](https://github.com/BerriAI/litellm/blob/main/litellm/llms/prompt_templates/factory.py)
|
||||
|
||||
#### Custom prompt templates
|
||||
|
||||
```python
|
||||
import litellm
from litellm import completion
|
||||
|
||||
# Create your own custom prompt template
|
||||
litellm.register_prompt_template(
|
||||
model="togethercomputer/LLaMA-2-7B-32K",
|
||||
roles={
|
||||
"system": {
|
||||
"pre_message": "[INST] <<SYS>>\n",
|
||||
"post_message": "\n<</SYS>>\n [/INST]\n"
|
||||
},
|
||||
"user": {
|
||||
"pre_message": "[INST] ",
|
||||
"post_message": " [/INST]\n"
|
||||
},
|
||||
"assistant": {
|
||||
"post_message": "\n"
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
def test_huggingface_custom_model():
|
||||
model = "huggingface/togethercomputer/LLaMA-2-7B-32K"
messages = [{"role": "user", "content": "Hello, how are you?"}]
|
||||
response = completion(model=model, messages=messages, api_base="https://ecd4sb5n09bo4ei2.us-east-1.aws.endpoints.huggingface.cloud")
|
||||
print(response['choices'][0]['message']['content'])
|
||||
return response
|
||||
|
||||
test_huggingface_custom_model()
|
||||
```
|
||||
|
||||
[Implementation Code](https://github.com/BerriAI/litellm/blob/c0b3da2c14c791a0b755f0b1e5a9ef065951ecbf/litellm/llms/huggingface_restapi.py#L52)
|
||||
|
||||
### Deploying a model on huggingface
|
||||
|
||||
You can use any chat/text model from Hugging Face with the following steps:
|
||||
|
||||
- Copy your model id/url from Huggingface Inference Endpoints
|
||||
- [ ] Go to https://ui.endpoints.huggingface.co/
|
||||
- [ ] Copy the url of the specific model you'd like to use
|
||||
<Image img={require('../../img/hf_inference_endpoint.png')} alt="HF_Dashboard" style={{ maxWidth: '50%', height: 'auto' }}/>
|
||||
- Set it as your model name
|
||||
- Set your HUGGINGFACE_API_KEY as an environment variable
|
||||
|
||||
Need help deploying a model on huggingface? [Check out this guide.](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint)
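Putting these steps together, a minimal sketch looks like this (the model id and endpoint URL below are placeholders - swap in your own deployment):

```python
import os
from litellm import completion

os.environ["HUGGINGFACE_API_KEY"] = "hf_xxxxxx"  # your Hugging Face token

response = completion(
    model="huggingface/glaiveai/glaive-coder-7b",  # placeholder model id
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    api_base="https://my-endpoint.us-east-1.aws.endpoints.huggingface.cloud",  # placeholder endpoint url
)
print(response)
```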
|
||||
|
||||
# output
|
||||
|
||||
Same as the OpenAI format, but also includes logprobs. [See the code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/llms/huggingface_restapi.py#L115)
|
||||
|
||||
```json
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"content": "\ud83d\ude31\n\nComment: @SarahSzabo I'm",
|
||||
"role": "assistant",
|
||||
"logprobs": -22.697942825499993
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": 1693436637.38206,
|
||||
"model": "https://ji16r2iys9a8rjk2.us-east-1.aws.endpoints.huggingface.cloud",
|
||||
"usage": {
|
||||
"prompt_tokens": 14,
|
||||
"completion_tokens": 11,
|
||||
"total_tokens": 25
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
# FAQ
|
||||
|
||||
**Does this support stop sequences?**

Yes, we support stop sequences - and you can pass as many as allowed by Hugging Face (or any provider!)

**How does billing work with Hugging Face Inference Providers?**

> Billing is centralized on your Hugging Face account, no matter which providers you are using. You are billed the standard provider API rates with no additional markup - Hugging Face simply passes through the provider costs. Note that [Hugging Face PRO](https://huggingface.co/subscribe/pro) users get $2 worth of Inference credits every month that can be used across providers.

**Do I need to create an account for each Inference Provider?**

> No, you don't need to create separate accounts. All requests are routed through Hugging Face, so you only need your HF token. This allows you to easily benchmark different providers and choose the one that best fits your needs.

**How do you deal with repetition penalty?**

We map the presence penalty parameter in OpenAI to the repetition penalty parameter on Hugging Face. [See code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/utils.py#L757).
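As a quick illustration of that mapping, you can pass OpenAI's `presence_penalty` and LiteLLM translates it to Hugging Face's repetition penalty under the hood (the model below is just an example):

```python
import os
from litellm import completion

os.environ["HF_TOKEN"] = "hf_xxxxxx"

response = completion(
    model="huggingface/together/deepseek-ai/DeepSeek-R1",  # example model
    messages=[{"role": "user", "content": "Tell me a short story."}],
    presence_penalty=0.5,  # mapped to Hugging Face's repetition penalty by LiteLLM
)
print(response.choices[0].message.content)
```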
|
||||
|
||||
**Will more inference providers be supported by Hugging Face in the future?**
|
||||
|
||||
> Yes! New inference providers (and models) are being added gradually.
|
||||
|
||||
We welcome any suggestions for improving our Hugging Face integration - Create an [issue](https://github.com/BerriAI/litellm/issues/new/choose)/[Join the Discord](https://discord.com/invite/wuPM9dRgDw)!
|
|
@ -4,17 +4,16 @@ import TabItem from '@theme/TabItem';
|
|||
# Infinity
|
||||
|
||||
| Property | Details |
|
||||
|-------|-------|
|
||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip |
|
||||
| Provider Route on LiteLLM | `infinity/` |
|
||||
| Supported Operations | `/rerank` |
|
||||
| Supported Operations | `/rerank`, `/embeddings` |
|
||||
| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) |
|
||||
|
||||
|
||||
## **Usage - LiteLLM Python SDK**
|
||||
|
||||
```python
|
||||
from litellm import rerank
|
||||
from litellm import rerank, embedding
|
||||
import os
|
||||
|
||||
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
|
||||
|
@ -39,8 +38,8 @@ model_list:
|
|||
- model_name: custom-infinity-rerank
|
||||
litellm_params:
|
||||
model: infinity/rerank
|
||||
api_key: os.environ/INFINITY_API_KEY
|
||||
api_base: https://localhost:8080
|
||||
api_key: os.environ/INFINITY_API_KEY
|
||||
```
|
||||
|
||||
Start litellm
|
||||
|
@ -51,7 +50,9 @@ litellm --config /path/to/config.yaml
|
|||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
Test request
|
||||
## Test request:
|
||||
|
||||
### Rerank
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/rerank \
|
||||
|
@ -70,11 +71,10 @@ curl http://0.0.0.0:4000/rerank \
|
|||
}'
|
||||
```
|
||||
|
||||
|
||||
## Supported Cohere Rerank API Params
|
||||
#### Supported Cohere Rerank API Params
|
||||
|
||||
| Param | Type | Description |
|
||||
|-------|-------|-------|
|
||||
| ------------------ | ----------- | ----------------------------------------------- |
|
||||
| `query` | `str` | The query to rerank the documents against |
|
||||
| `documents` | `list[str]` | The documents to rerank |
|
||||
| `top_n` | `int` | The number of documents to return |
|
||||
|
@ -138,6 +138,7 @@ response = rerank(
|
|||
raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
@ -179,6 +180,121 @@ curl http://0.0.0.0:4000/rerank \
|
|||
"raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
## Embeddings
|
||||
|
||||
LiteLLM provides an OpenAI API-compatible `/embeddings` endpoint for embedding calls.
|
||||
|
||||
**Setup**
|
||||
|
||||
Add this to your litellm proxy config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: custom-infinity-embedding
|
||||
litellm_params:
|
||||
model: infinity/provider/custom-embedding-v1
|
||||
api_base: http://localhost:8080
|
||||
api_key: os.environ/INFINITY_API_KEY
|
||||
```
|
||||
|
||||
### Test request:
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/embeddings \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "custom-infinity-embedding",
|
||||
"input": ["hello"]
|
||||
}'
|
||||
```
|
||||
|
||||
#### Supported Embedding API Params
|
||||
|
||||
| Param | Type | Description |
|
||||
| ----------------- | ----------- | ----------------------------------------------------------- |
|
||||
| `model` | `str` | The embedding model to use |
|
||||
| `input` | `list[str]` | The text inputs to generate embeddings for |
|
||||
| `encoding_format` | `str` | The format to return embeddings in (e.g. "float", "base64") |
|
||||
| `modality` | `str` | The type of input (e.g. "text", "image", "audio") |
|
||||
|
||||
### Usage - Basic Examples
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import embedding
|
||||
import os
|
||||
|
||||
os.environ["INFINITY_API_BASE"] = "http://localhost:8080"
|
||||
|
||||
response = embedding(
|
||||
model="infinity/bge-small",
|
||||
input=["good morning from litellm"]
|
||||
)
|
||||
|
||||
print(response.data[0]['embedding'])
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/embeddings \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "custom-infinity-embedding",
|
||||
"input": ["hello"]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Usage - OpenAI Client
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="<LITELLM_MASTER_KEY>",
|
||||
base_url="<LITELLM_URL>"
|
||||
)
|
||||
|
||||
response = client.embeddings.create(
|
||||
model="bge-small",
|
||||
input=["The food was delicious and the waiter..."],
|
||||
encoding_format="float"
|
||||
)
|
||||
|
||||
print(response.data[0].embedding)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/embeddings \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "bge-small",
|
||||
"input": ["The food was delicious and the waiter..."],
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
|
|
@ -163,6 +163,12 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
|
|||
|
||||
| Model Name | Function Call |
|
||||
|-----------------------|-----------------------------------------------------------------|
|
||||
| gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` |
|
||||
| gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` |
|
||||
| gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` |
|
||||
| o4-mini | `response = completion(model="o4-mini", messages=messages)` |
|
||||
| o3-mini | `response = completion(model="o3-mini", messages=messages)` |
|
||||
| o3 | `response = completion(model="o3", messages=messages)` |
|
||||
| o1-mini | `response = completion(model="o1-mini", messages=messages)` |
|
||||
| o1-preview | `response = completion(model="o1-preview", messages=messages)` |
|
||||
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |
|
||||
|
|
|
@ -347,7 +347,7 @@ Return a `list[Recipe]`
|
|||
completion(model="vertex_ai/gemini-1.5-flash-preview-0514", messages=messages, response_format={ "type": "json_object" })
|
||||
```
|
||||
|
||||
### **Grounding**
|
||||
### **Grounding - Web Search**
|
||||
|
||||
Add Google Search Result grounding to vertex ai calls.
|
||||
|
||||
|
@ -358,13 +358,13 @@ See the grounding metadata with `response_obj._hidden_params["vertex_ai_groundin
|
|||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
```python showLineNumbers
|
||||
from litellm import completion
|
||||
|
||||
## SETUP ENVIRONMENT
|
||||
# !gcloud auth application-default login - run this to add vertex credentials to your env
|
||||
|
||||
tools = [{"googleSearchRetrieval": {}}] # 👈 ADD GOOGLE SEARCH
|
||||
tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH
|
||||
|
||||
resp = litellm.completion(
|
||||
model="vertex_ai/gemini-1.0-pro-001",
|
||||
|
@ -377,27 +377,121 @@ print(resp)
|
|||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI Python SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
|
||||
base_url="http://0.0.0.0:4000/v1/" # point to litellm proxy
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gemini-pro",
|
||||
messages=[{"role": "user", "content": "Who won the world cup?"}],
|
||||
tools=[{"googleSearch": {}}],
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="curl" label="cURL">
|
||||
|
||||
```bash showLineNumbers
|
||||
curl http://localhost:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-d '{
|
||||
"model": "gemini-pro",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello, Claude!"}
|
||||
{"role": "user", "content": "Who won the world cup?"}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"googleSearchRetrieval": {}
|
||||
"googleSearch": {}
|
||||
}
|
||||
]
|
||||
}'
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
You can also use the `enterpriseWebSearch` tool for an [enterprise compliant search](https://cloud.google.com/vertex-ai/generative-ai/docs/grounding/web-grounding-enterprise).
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from litellm import completion
|
||||
|
||||
## SETUP ENVIRONMENT
|
||||
# !gcloud auth application-default login - run this to add vertex credentials to your env
|
||||
|
||||
tools = [{"enterpriseWebSearch": {}}] # 👈 ADD GOOGLE ENTERPRISE SEARCH
|
||||
|
||||
resp = litellm.completion(
|
||||
model="vertex_ai/gemini-1.0-pro-001",
|
||||
messages=[{"role": "user", "content": "Who won the world cup?"}],
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
print(resp)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI Python SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
|
||||
base_url="http://0.0.0.0:4000/v1/" # point to litellm proxy
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gemini-pro",
|
||||
messages=[{"role": "user", "content": "Who won the world cup?"}],
|
||||
tools=[{"enterpriseWebSearch": {}}],
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="curl" label="cURL">
|
||||
|
||||
```bash showLineNumbers
|
||||
curl http://localhost:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-d '{
|
||||
"model": "gemini-pro",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Who won the world cup?"}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"enterpriseWebSearch": {}
|
||||
}
|
||||
]
|
||||
}'
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
#### **Moving from Vertex AI SDK to LiteLLM (GROUNDING)**
|
||||
|
||||
|
||||
|
@ -435,7 +529,7 @@ from litellm import completion
|
|||
|
||||
# !gcloud auth application-default login - run this to add vertex credentials to your env
|
||||
|
||||
tools = [{"googleSearchRetrieval": {"disable_attributon": False}}] # 👈 ADD GOOGLE SEARCH
|
||||
tools = [{"googleSearch": {"disable_attributon": False}}] # 👈 ADD GOOGLE SEARCH
|
||||
|
||||
resp = litellm.completion(
|
||||
model="vertex_ai/gemini-1.0-pro-001",
|
||||
|
@ -448,9 +542,157 @@ print(resp)
|
|||
```
|
||||
|
||||
|
||||
### **Thinking / `reasoning_content`**
|
||||
|
||||
LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
|
||||
|
||||
**Mapping**
|
||||
|
||||
| reasoning_effort | thinking |
|
||||
| ---------------- | -------- |
|
||||
| "low" | "budget_tokens": 1024 |
|
||||
| "medium" | "budget_tokens": 2048 |
|
||||
| "high" | "budget_tokens": 4096 |
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
# !gcloud auth application-default login - run this to add vertex credentials to your env
|
||||
|
||||
resp = completion(
|
||||
model="vertex_ai/gemini-2.5-flash-preview-04-17",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
reasoning_effort="low",
|
||||
vertex_project="project-id",
|
||||
vertex_location="us-central1"
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
- model_name: gemini-2.5-flash
|
||||
litellm_params:
|
||||
model: vertex_ai/gemini-2.5-flash-preview-04-17
|
||||
vertex_credentials: {"project_id": "project-id", "location": "us-central1", "project_key": "project-key"}
|
||||
vertex_project: "project-id"
|
||||
vertex_location: "us-central1"
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
|
||||
-d '{
|
||||
"model": "gemini-2.5-flash",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"reasoning_effort": "low"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```python
|
||||
ModelResponse(
|
||||
id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
|
||||
created=1740470510,
|
||||
model='vertex_ai/gemini-2.5-flash-preview-04-17',
|
||||
object='chat.completion',
|
||||
system_fingerprint=None,
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason='stop',
|
||||
index=0,
|
||||
message=Message(
|
||||
content="The capital of France is Paris.",
|
||||
role='assistant',
|
||||
tool_calls=None,
|
||||
function_call=None,
|
||||
reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
|
||||
),
|
||||
)
|
||||
],
|
||||
usage=Usage(
|
||||
completion_tokens=68,
|
||||
prompt_tokens=42,
|
||||
total_tokens=110,
|
||||
completion_tokens_details=None,
|
||||
prompt_tokens_details=PromptTokensDetailsWrapper(
|
||||
audio_tokens=None,
|
||||
cached_tokens=0,
|
||||
text_tokens=None,
|
||||
image_tokens=None
|
||||
),
|
||||
cache_creation_input_tokens=0,
|
||||
cache_read_input_tokens=0
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
#### Pass `thinking` to Gemini models
|
||||
|
||||
You can also pass the `thinking` parameter to Gemini models.
|
||||
|
||||
This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
# !gcloud auth application-default login - run this to add vertex credentials to your env
|
||||
|
||||
response = litellm.completion(
|
||||
model="vertex_ai/gemini-2.5-flash-preview-04-17",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
thinking={"type": "enabled", "budget_tokens": 1024},
|
||||
vertex_project="project-id",
|
||||
vertex_location="us-central1"
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||
-d '{
|
||||
"model": "vertex_ai/gemini-2.5-flash-preview-04-17",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"thinking": {"type": "enabled", "budget_tokens": 1024}
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### **Context Caching**
|
||||
|
||||
Use Vertex AI context caching is supported by calling provider api directly. (Unified Endpoint support comin soon.).
|
||||
Vertex AI context caching is supported by calling the provider API directly. (Unified Endpoint support coming soon.)
|
||||
|
||||
[**Go straight to provider**](../pass_through/vertex_ai.md#context-caching)
|
||||
|
||||
|
@ -668,7 +910,7 @@ export VERTEXAI_PROJECT="my-test-project" # ONLY use if model project is differe
|
|||
|
||||
|
||||
## Specifying Safety Settings
|
||||
In certain use-cases you may need to make calls to the models and pass [safety settigns](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example:
|
||||
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example:
|
||||
|
||||
### Set per model/request
|
||||
|
||||
|
@ -1808,7 +2050,7 @@ response = completion(
|
|||
print(response.choices[0])
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" lable="PROXY">
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Add model to config
|
||||
|
||||
|
|
|
@ -161,6 +161,120 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
|
|||
|
||||
Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="files_message" label="(Unified) Files Message">
|
||||
|
||||
Use this to send a video url to VLLM + Gemini in the same format, using OpenAI's `files` message type.
|
||||
|
||||
There are two ways to send a video url to VLLM:
|
||||
|
||||
1. Pass the video url directly
|
||||
|
||||
```
|
||||
{"type": "file", "file": {"file_id": video_url}},
|
||||
```
|
||||
|
||||
2. Pass the video data as base64
|
||||
|
||||
```
|
||||
{"type": "file", "file": {"file_data": f"data:video/mp4;base64,{video_data_base64}"}}
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import os
from litellm import completion
|
||||
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Summarize the following video"
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
# call vllm
|
||||
os.environ["HOSTED_VLLM_API_BASE"] = "https://hosted-vllm-api.co"
|
||||
os.environ["HOSTED_VLLM_API_KEY"] = "" # [optional], if your VLLM server requires an API key
|
||||
response = completion(
|
||||
model="hosted_vllm/qwen", # pass the vllm model name
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
# call gemini
|
||||
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
|
||||
response = completion(
|
||||
model="gemini/gemini-1.5-flash", # pass the gemini model name
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: my-model
|
||||
litellm_params:
|
||||
model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider
|
||||
api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider
|
||||
- model_name: my-gemini-model
|
||||
litellm_params:
|
||||
model: gemini/gemini-1.5-flash # add gemini/ prefix to route as Google AI Studio provider
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -X POST http://0.0.0.0:4000/chat/completions \
|
||||
-H "Authorization: Bearer sk-1234" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [
|
||||
{"role": "user", "content":
|
||||
[
|
||||
{"type": "text", "text": "Summarize the following video"},
|
||||
{"type": "file", "file": {"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="video_url" label="(VLLM-specific) Video Message">
|
||||
|
||||
Use this to send a video url to VLLM in its native message format (`video_url`).
|
||||
|
||||
There are two ways to send a video url to VLLM:
|
||||
|
||||
1. Pass the video url directly
|
||||
|
@ -249,6 +363,10 @@ curl -X POST http://0.0.0.0:4000/chat/completions \
|
|||
</Tabs>
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## (Deprecated) for `vllm pip package`
|
||||
### Using - `litellm.completion`
|
||||
|
||||
|
|
|
@ -18,13 +18,14 @@ os.environ['XAI_API_KEY']
|
|||
```
|
||||
|
||||
## Sample Usage
|
||||
```python
|
||||
|
||||
```python showLineNumbers title="LiteLLM python sdk usage - Non-streaming"
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['XAI_API_KEY'] = ""
|
||||
response = completion(
|
||||
model="xai/grok-2-latest",
|
||||
model="xai/grok-3-mini-beta",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -45,13 +46,14 @@ print(response)
|
|||
```
|
||||
|
||||
## Sample Usage - Streaming
|
||||
```python
|
||||
|
||||
```python showLineNumbers title="LiteLLM python sdk usage - Streaming"
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['XAI_API_KEY'] = ""
|
||||
response = completion(
|
||||
model="xai/grok-2-latest",
|
||||
model="xai/grok-3-mini-beta",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -75,7 +77,8 @@ for chunk in response:
|
|||
```
|
||||
|
||||
## Sample Usage - Vision
|
||||
```python
|
||||
|
||||
```python showLineNumbers title="LiteLLM python sdk usage - Vision"
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
|
@ -110,7 +113,7 @@ Here's how to call a XAI model with the LiteLLM Proxy Server
|
|||
|
||||
1. Modify the config.yaml
|
||||
|
||||
```yaml
|
||||
```yaml showLineNumbers
|
||||
model_list:
|
||||
- model_name: my-model
|
||||
litellm_params:
|
||||
|
@ -131,7 +134,7 @@ Here's how to call a XAI model with the LiteLLM Proxy Server
|
|||
|
||||
<TabItem value="openai" label="OpenAI Python v1.0.0+">
|
||||
|
||||
```python
|
||||
```python showLineNumbers
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
|
||||
|
@ -173,3 +176,81 @@ Here's how to call a XAI model with the LiteLLM Proxy Server
|
|||
</Tabs>
|
||||
|
||||
|
||||
## Reasoning Usage
|
||||
|
||||
LiteLLM supports reasoning usage for xAI models.
|
||||
|
||||
<Tabs>
|
||||
|
||||
<TabItem value="python" label="LiteLLM Python SDK">
|
||||
|
||||
```python showLineNumbers title="reasoning with xai/grok-3-mini-beta"
|
||||
import litellm
|
||||
response = litellm.completion(
|
||||
model="xai/grok-3-mini-beta",
|
||||
messages=[{"role": "user", "content": "What is 101*3?"}],
|
||||
reasoning_effort="low",
|
||||
)
|
||||
|
||||
print("Reasoning Content:")
|
||||
print(response.choices[0].message.reasoning_content)
|
||||
|
||||
print("\nFinal Response:")
|
||||
print(response.choices[0].message.content)
|
||||
|
||||
print("\nNumber of completion tokens (input):")
|
||||
print(response.usage.completion_tokens)
|
||||
|
||||
print("\nNumber of reasoning tokens (input):")
|
||||
print(response.usage.completion_tokens_details.reasoning_tokens)
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="curl" label="LiteLLM Proxy - OpenAI SDK Usage">
|
||||
|
||||
```python showLineNumbers title="reasoning with xai/grok-3-mini-beta"
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
|
||||
base_url="http://0.0.0.0:4000" # litellm-proxy-base url
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="xai/grok-3-mini-beta",
|
||||
messages=[{"role": "user", "content": "What is 101*3?"}],
|
||||
reasoning_effort="low",
|
||||
)
|
||||
|
||||
print("Reasoning Content:")
|
||||
print(response.choices[0].message.reasoning_content)
|
||||
|
||||
print("\nFinal Response:")
|
||||
print(response.choices[0].message.content)
|
||||
|
||||
print("\nNumber of completion tokens (input):")
|
||||
print(response.usage.completion_tokens)
|
||||
|
||||
print("\nNumber of reasoning tokens (input):")
|
||||
print(response.usage.completion_tokens_details.reasoning_tokens)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
**Example Response:**
|
||||
|
||||
```shell
|
||||
Reasoning Content:
|
||||
Let me calculate 101 multiplied by 3:
|
||||
101 * 3 = 303.
|
||||
I can double-check that: 100 * 3 is 300, and 1 * 3 is 3, so 300 + 3 = 303. Yes, that's correct.
|
||||
|
||||
Final Response:
|
||||
The result of 101 multiplied by 3 is 303.
|
||||
|
||||
Number of completion tokens (output):
|
||||
14
|
||||
|
||||
Number of reasoning tokens (output):
|
||||
310
|
||||
```
|
||||
|
|
|
@ -243,12 +243,12 @@ We allow you to pass a local image or a an http/https url of your image
|
|||
|
||||
Set `UI_LOGO_PATH` in your env. We recommend using a hosted image - it's a lot easier to set up and debug.
|
||||
|
||||
Exaple setting Hosted image
|
||||
Example setting Hosted image
|
||||
```shell
|
||||
UI_LOGO_PATH="https://litellm-logo-aws-marketplace.s3.us-west-2.amazonaws.com/berriai-logo-github.png"
|
||||
```
|
||||
|
||||
Exaple setting a local image (on your container)
|
||||
Example setting a local image (on your container)
|
||||
```shell
|
||||
UI_LOGO_PATH="ui_images/logo.jpg"
|
||||
```
|
||||
|
|
|
@ -213,7 +213,7 @@ model_list:
|
|||
general_settings:
|
||||
master_key: sk-1234
|
||||
alerting: ["slack"]
|
||||
alerting_threshold: 0.0001 # (Seconds) set an artifically low threshold for testing alerting
|
||||
alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
|
||||
alert_to_webhook_url: {
|
||||
"llm_exceptions": "https://hooks.slack.com/services/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",
|
||||
"llm_too_slow": "https://hooks.slack.com/services/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",
|
||||
|
@ -247,7 +247,7 @@ model_list:
|
|||
general_settings:
|
||||
master_key: sk-1234
|
||||
alerting: ["slack"]
|
||||
alerting_threshold: 0.0001 # (Seconds) set an artifically low threshold for testing alerting
|
||||
alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
|
||||
alert_to_webhook_url: {
|
||||
"llm_exceptions": ["os.environ/SLACK_WEBHOOK_URL", "os.environ/SLACK_WEBHOOK_URL_2"],
|
||||
"llm_too_slow": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"],
|
||||
|
@ -425,7 +425,7 @@ curl -X GET --location 'http://0.0.0.0:4000/health/services?service=webhook' \
|
|||
- `projected_exceeded_date` *str or null*: The date when the budget is projected to be exceeded, returned when 'soft_budget' is set for key (optional).
|
||||
- `projected_spend` *float or null*: The projected spend amount, returned when 'soft_budget' is set for key (optional).
|
||||
- `event` *Literal["budget_crossed", "threshold_crossed", "projected_limit_exceeded"]*: The type of event that triggered the webhook (see the example payload after this list). Possible values are:
|
||||
* "spend_tracked": Emitted whenver spend is tracked for a customer id.
|
||||
* "spend_tracked": Emitted whenever spend is tracked for a customer id.
|
||||
* "budget_crossed": Indicates that the spend has exceeded the max budget.
|
||||
* "threshold_crossed": Indicates that spend has crossed a threshold (currently sent when 85% and 95% of budget is reached).
|
||||
* "projected_limit_exceeded": For "key" only - Indicates that the projected spend is expected to exceed the soft budget threshold.
|
||||
|
@ -480,7 +480,7 @@ LLM-related Alerts
|
|||
| `cooldown_deployment` | Alerts when a deployment is put into cooldown | ✅ |
|
||||
| `new_model_added` | Notifications when a new model is added to litellm proxy through /model/new| ✅ |
|
||||
| `outage_alerts` | Alerts when a specific LLM deployment is facing an outage | ✅ |
|
||||
| `region_outage_alerts` | Alerts when a specfic LLM region is facing an outage. Example us-east-1 | ✅ |
|
||||
| `region_outage_alerts` | Alerts when a specific LLM region is facing an outage. Example us-east-1 | ✅ |
|
||||
|
||||
Budget and Spend Alerts
|
||||
|
||||
|
|
|
@ -299,6 +299,9 @@ router_settings:
|
|||
|------|-------------|
|
||||
| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
|
||||
| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
|
||||
| AGENTOPS_ENVIRONMENT | Environment for AgentOps logging integration
|
||||
| AGENTOPS_API_KEY | API Key for AgentOps logging integration
|
||||
| AGENTOPS_SERVICE_NAME | Service Name for AgentOps logging integration
|
||||
| AISPEND_ACCOUNT_ID | Account ID for AI Spend
|
||||
| AISPEND_API_KEY | API Key for AI Spend
|
||||
| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
|
||||
|
@ -323,6 +326,9 @@ router_settings:
|
|||
| AZURE_AUTHORITY_HOST | Azure authority host URL
|
||||
| AZURE_CLIENT_ID | Client ID for Azure services
|
||||
| AZURE_CLIENT_SECRET | Client secret for Azure services
|
||||
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
|
||||
| AZURE_USERNAME | Username for Azure services, use in conjunction with AZURE_PASSWORD for azure ad token with basic username/password workflow
|
||||
| AZURE_PASSWORD | Password for Azure services, use in conjunction with AZURE_USERNAME for azure ad token with basic username/password workflow
|
||||
| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
|
||||
| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
|
||||
| AZURE_STORAGE_ACCOUNT_KEY | The Azure Storage Account Key to use for Authentication to Azure Blob Storage logging
|
||||
|
@ -331,7 +337,6 @@ router_settings:
|
|||
| AZURE_STORAGE_TENANT_ID | The Application Tenant ID to use for Authentication to Azure Blob Storage logging
|
||||
| AZURE_STORAGE_CLIENT_ID | The Application Client ID to use for Authentication to Azure Blob Storage logging
|
||||
| AZURE_STORAGE_CLIENT_SECRET | The Application Client Secret to use for Authentication to Azure Blob Storage logging
|
||||
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
|
||||
| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
|
||||
| BRAINTRUST_API_KEY | API key for Braintrust integration
|
||||
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
|
||||
|
@ -406,6 +411,7 @@ router_settings:
|
|||
| HELICONE_API_KEY | API key for Helicone service
|
||||
| HOSTNAME | Hostname for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog)
|
||||
| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
|
||||
| HUGGINGFACE_API_KEY | API key for Hugging Face API
|
||||
| IAM_TOKEN_DB_AUTH | IAM token for database authentication
|
||||
| JSON_LOGS | Enable JSON formatted logging
|
||||
| JWT_AUDIENCE | Expected audience for JWT tokens
|
||||
|
@ -432,6 +438,7 @@ router_settings:
|
|||
| LITERAL_BATCH_SIZE | Batch size for Literal operations
|
||||
| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
|
||||
| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
|
||||
| LITELLM_MODIFY_PARAMS | Parameters to modify in LiteLLM requests
|
||||
| LITELLM_EMAIL | Email associated with LiteLLM account
|
||||
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
|
||||
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
|
||||
|
@ -445,9 +452,12 @@ router_settings:
|
|||
| LITELLM_TOKEN | Access token for LiteLLM integration
|
||||
| LITELLM_PRINT_STANDARD_LOGGING_PAYLOAD | If true, prints the standard logging payload to the console - useful for debugging
|
||||
| LOGFIRE_TOKEN | Token for Logfire logging service
|
||||
| MISTRAL_API_BASE | Base URL for Mistral API
|
||||
| MISTRAL_API_KEY | API key for Mistral API
|
||||
| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
|
||||
| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
|
||||
| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
|
||||
| MICROSOFT_SERVICE_PRINCIPAL_ID | Service Principal ID for Microsoft Enterprise Application. (This is an advanced feature if you want litellm to auto-assign members to Litellm Teams based on their Microsoft Entra ID Groups)
|
||||
| NO_DOCS | Flag to disable documentation generation
|
||||
| NO_PROXY | List of addresses to bypass proxy
|
||||
| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval
|
||||
|
|
|
@ -6,6 +6,8 @@ import Image from '@theme/IdealImage';
|
|||
|
||||
Track spend for keys, users, and teams across 100+ LLMs.
|
||||
|
||||
LiteLLM automatically tracks spend for all known models. See our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||
|
||||
### How to Track Spend with LiteLLM
|
||||
|
||||
**Step 1**
|
||||
|
@ -35,10 +37,10 @@ response = client.chat.completions.create(
|
|||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
],
|
||||
user="palantir",
|
||||
user="palantir", # OPTIONAL: pass user to track spend by user
|
||||
extra_body={
|
||||
"metadata": {
|
||||
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"]
|
||||
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags
|
||||
}
|
||||
}
|
||||
)
|
||||
|
@ -63,9 +65,9 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
"content": "what llm are you"
|
||||
}
|
||||
],
|
||||
"user": "palantir",
|
||||
"user": "palantir", # OPTIONAL: pass user to track spend by user
|
||||
"metadata": {
|
||||
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"]
|
||||
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
@ -90,7 +92,7 @@ chat = ChatOpenAI(
|
|||
user="palantir",
|
||||
extra_body={
|
||||
"metadata": {
|
||||
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"]
|
||||
"tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags
|
||||
}
|
||||
}
|
||||
)
|
||||
|
@ -150,8 +152,112 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## ✨ (Enterprise) API Endpoints to get Spend
|
||||
### Getting Spend Reports - To Charge Other Teams, Customers, Users
|
||||
### Allowing Non-Proxy Admins to access `/spend` endpoints
|
||||
|
||||
Use this when you want non-proxy admins to access `/spend` endpoints
|
||||
|
||||
:::info
|
||||
|
||||
Schedule a [meeting with us to get your Enterprise License](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
|
||||
|
||||
:::
|
||||
|
||||
##### Create Key
|
||||
Create Key with `permissions={"get_spend_routes": true}`
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"permissions": {"get_spend_routes": true}
|
||||
}'
|
||||
```
|
||||
|
||||
##### Use generated key on `/spend` endpoints
|
||||
|
||||
Access spend routes with newly generated keys
|
||||
```shell
|
||||
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
|
||||
-H 'Authorization: Bearer sk-H16BKvrSNConSsBYLGc_7A'
|
||||
```
|
||||
|
||||
|
||||
|
||||
#### Reset Team, API Key Spend - MASTER KEY ONLY
|
||||
|
||||
Use `/global/spend/reset` if you want to:
|
||||
- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0`
|
||||
|
||||
- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes
|
||||
|
||||
##### Request
|
||||
Only the `LITELLM_MASTER_KEY` you set can access this route
|
||||
```shell
|
||||
curl -X POST \
|
||||
'http://localhost:4000/global/spend/reset' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
##### Expected Responses
|
||||
|
||||
```shell
|
||||
{"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
|
||||
```
|
||||
|
||||
## Daily Spend Breakdown API
|
||||
|
||||
Retrieve granular daily usage data for a user (by model, provider, and API key) with a single endpoint.
|
||||
|
||||
Example Request:
|
||||
|
||||
```shell title="Daily Spend Breakdown API" showLineNumbers
|
||||
curl -L -X GET 'http://localhost:4000/user/daily/activity?start_date=2025-03-20&end_date=2025-03-27' \
|
||||
-H 'Authorization: Bearer sk-...'
|
||||
```
|
||||
|
||||
```json title="Daily Spend Breakdown API Response" showLineNumbers
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"date": "2025-03-27",
|
||||
"metrics": {
|
||||
"spend": 0.0177072,
|
||||
"prompt_tokens": 111,
|
||||
"completion_tokens": 1711,
|
||||
"total_tokens": 1822,
|
||||
"api_requests": 11
|
||||
},
|
||||
"breakdown": {
|
||||
"models": {
|
||||
"gpt-4o-mini": {
|
||||
"spend": 1.095e-05,
|
||||
"prompt_tokens": 37,
|
||||
"completion_tokens": 9,
|
||||
"total_tokens": 46,
|
||||
"api_requests": 1
|
||||
},
|
||||
"providers": { "openai": { ... }, "azure_ai": { ... } },
|
||||
"api_keys": { "3126b6eaf1...": { ... } }
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"total_spend": 0.7274667,
|
||||
"total_prompt_tokens": 280990,
|
||||
"total_completion_tokens": 376674,
|
||||
"total_api_requests": 14
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### API Reference
|
||||
|
||||
See our [Swagger API](https://litellm-api.up.railway.app/#/Budget%20%26%20Spend%20Tracking/get_user_daily_activity_user_daily_activity_get) for more details on the `/user/daily/activity` endpoint
|
||||
|
||||
## ✨ (Enterprise) Generate Spend Reports
|
||||
|
||||
Use this to charge other teams, customers, users
|
||||
|
||||
Use the `/global/spend/report` endpoint to get spend reports
|
||||
|
||||
|
@ -470,105 +576,6 @@ curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end
|
|||
|
||||
</Tabs>
|
||||
|
||||
### Allowing Non-Proxy Admins to access `/spend` endpoints
|
||||
|
||||
Use this when you want non-proxy admins to access `/spend` endpoints
|
||||
|
||||
:::info
|
||||
|
||||
Schedule a [meeting with us to get your Enterprise License](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
|
||||
|
||||
:::
|
||||
|
||||
##### Create Key
|
||||
Create Key with `permissions={"get_spend_routes": true}`
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"permissions": {"get_spend_routes": true}
|
||||
}'
|
||||
```
|
||||
|
||||
##### Use generated key on `/spend` endpoints
|
||||
|
||||
Access spend routes with the newly generated key
|
||||
```shell
|
||||
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
|
||||
-H 'Authorization: Bearer sk-H16BKvrSNConSsBYLGc_7A'
|
||||
```
|
||||
#### Reset Team, API Key Spend - MASTER KEY ONLY
|
||||
|
||||
Use `/global/spend/reset` if you want to:
|
||||
- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0`
|
||||
|
||||
- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes
|
||||
|
||||
##### Request
|
||||
Only the `LITELLM_MASTER_KEY` you set can access this route
|
||||
```shell
|
||||
curl -X POST \
|
||||
'http://localhost:4000/global/spend/reset' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
##### Expected Responses
|
||||
|
||||
```shell
|
||||
{"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
|
||||
```
|
||||
## Spend Tracking for Azure OpenAI Models
|
||||
|
||||
Set the base model for cost tracking on Azure image generation calls
|
||||
|
||||
#### Image Generation
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: dall-e-3
|
||||
litellm_params:
|
||||
model: azure/dall-e-3-test
|
||||
api_version: 2023-06-01-preview
|
||||
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
base_model: dall-e-3 # 👈 set dall-e-3 as base model
|
||||
model_info:
|
||||
mode: image_generation
|
||||
```
|
||||
|
||||
#### Chat Completions / Embeddings
|
||||
|
||||
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
|
||||
|
||||
**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating azure cost
|
||||
|
||||
Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||
|
||||
Example config with `base_model`
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-gpt-3.5
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
base_model: azure/gpt-4-1106-preview
|
||||
```
|
||||
|
||||
## Custom Input/Output Pricing
|
||||
|
||||
👉 Head to [Custom Input/Output Pricing](https://docs.litellm.ai/docs/proxy/custom_pricing) to set up custom pricing for your models
|
||||
|
||||
## ✨ Custom Spend Log metadata
|
||||
|
||||
|
@ -588,3 +595,4 @@ Logging specific key,value pairs in spend logs metadata is an enterprise feature
|
|||
Tracking spend with Custom tags is an enterprise feature. [See here](./enterprise.md#tracking-spend-for-custom-tags)
|
||||
|
||||
:::
|
||||
|
||||
|
|
|
@ -26,10 +26,12 @@ model_list:
|
|||
- model_name: sagemaker-completion-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
||||
model_info:
|
||||
input_cost_per_second: 0.000420
|
||||
- model_name: sagemaker-embedding-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
|
||||
model_info:
|
||||
input_cost_per_second: 0.000420
|
||||
```
|
||||
|
||||
|
@ -54,12 +56,56 @@ model_list:
|
|||
model: azure/<your_deployment_name>
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_version: os.environ/AZURE_API_VERSION
|
||||
model_info:
|
||||
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
|
||||
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
|
||||
```
|
||||
|
||||
### Debugging
|
||||
## Override Model Cost Map
|
||||
|
||||
You can override [our model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) with your own custom pricing for a mapped model.
|
||||
|
||||
Just add a `model_info` key to your model in the config, and override the desired keys.
|
||||
|
||||
Example: Override Anthropic's model cost map for the `prod/claude-3-5-sonnet-20241022` model.
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: "prod/claude-3-5-sonnet-20241022"
|
||||
litellm_params:
|
||||
model: "anthropic/claude-3-5-sonnet-20241022"
|
||||
api_key: os.environ/ANTHROPIC_PROD_API_KEY
|
||||
model_info:
|
||||
input_cost_per_token: 0.000006
|
||||
output_cost_per_token: 0.00003
|
||||
cache_creation_input_token_cost: 0.0000075
|
||||
cache_read_input_token_cost: 0.0000006
|
||||
```
|
||||
|
||||
## Set 'base_model' for Cost Tracking (e.g. Azure deployments)
|
||||
|
||||
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
|
||||
|
||||
**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating azure cost
|
||||
|
||||
Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||
|
||||
Example config with `base_model`
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-gpt-3.5
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
base_model: azure/gpt-4-1106-preview
|
||||
```
|
||||
|
||||
|
||||
## Debugging
|
||||
|
||||
If your custom pricing is not being used or you're seeing errors, please check the following:
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ LiteLLM writes `UPDATE` and `UPSERT` queries to the DB. When using 10+ instances
|
|||
|
||||
### Stage 1. Each instance writes updates to redis
|
||||
|
||||
Each instance will accumulate the spend updates for a key, user, team, etc. and write the updates to a redis queue.
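For intuition, here is a minimal sketch of that accumulate-then-enqueue pattern. It is an illustration only, not LiteLLM's actual code: the queue key (`spend_update_queue`), payload shape, and helper names are assumptions.

```python
import json
import redis

r = redis.Redis(host="localhost", port=6379)

# In-memory accumulator: entity id (key / user / team) -> pending spend delta
pending_spend: dict = {}

def record_spend(entity_id: str, spend: float) -> None:
    """Buffer a spend delta in memory instead of writing straight to the DB."""
    pending_spend[entity_id] = pending_spend.get(entity_id, 0.0) + spend

def flush_to_redis_queue() -> None:
    """Push the buffered updates onto a shared Redis queue for a single writer to drain."""
    if not pending_spend:
        return
    r.rpush("spend_update_queue", json.dumps(pending_spend))
    pending_spend.clear()
```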
|
||||
|
||||
<Image img={require('../../img/deadlock_fix_1.png')} style={{ width: '900px', height: 'auto' }} />
|
||||
<p style={{textAlign: 'left', color: '#666'}}>
|
||||
|
|
|
@ -22,7 +22,7 @@ echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
|
|||
|
||||
# Add the litellm salt key - you cannot change this after adding a model
|
||||
# It is used to encrypt / decrypt your LLM API Key credentials
|
||||
# We recommend - https://1password.com/password-generator/
|
||||
# password generator to get a random hash for litellm salt key
|
||||
echo 'LITELLM_SALT_KEY="sk-1234"' >> .env
|
||||
|
||||
|
@ -125,7 +125,7 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
|
|||
|
||||
### Build from litellm `pip` package
|
||||
|
||||
Follow these instructions to build a docker container from the litellm pip package. If your company has a strict requirement around security / building images you can follow these steps.
|
||||
|
||||
Dockerfile
|
||||
|
||||
|
@ -999,7 +999,7 @@ services:
|
|||
- "4000:4000" # Map the container port to the host, change the host port if necessary
|
||||
volumes:
|
||||
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
|
||||
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches with the container port defined above in `ports` value
|
||||
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
|
||||
|
||||
# ...rest of your docker-compose config if any
|
||||
|
|
|
@ -691,7 +691,7 @@ curl --request POST \
|
|||
<TabItem value="admin_only_routes" label="Test `admin_only_routes`">
|
||||
|
||||
|
||||
**Successful Request**
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/key/generate' \
|
||||
|
@ -729,7 +729,7 @@ curl --location 'http://0.0.0.0:4000/key/generate' \
|
|||
<TabItem value="allowed_routes" label="Test `allowed_routes`">
|
||||
|
||||
|
||||
**Successful Request**
|
||||
|
||||
```shell
|
||||
curl http://localhost:4000/chat/completions \
|
||||
|
|
|
@ -140,7 +140,7 @@ The above request should not be blocked, and you should receive a regular LLM re
|
|||
|
||||
</Tabs>
|
||||
|
||||
## Advanced
|
||||
|
||||
Aim Guard provides user-specific Guardrail policies, enabling you to apply tailored policies to individual users.
|
||||
To use this feature, include the end user's email by setting the `x-aim-user-email` header on your request, as in the sketch below.
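A hedged example of what that looks like with the OpenAI SDK pointed at the LiteLLM proxy; the proxy URL, virtual key, and model name are placeholders.

```python
from openai import OpenAI

# Placeholder proxy URL and virtual key
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

response = client.chat.completions.create(
    model="gpt-4o",  # any model configured on your proxy
    messages=[{"role": "user", "content": "hi"}],
    # end-user email, used by Aim to resolve the user-specific policy
    extra_headers={"x-aim-user-email": "user@example.com"},
)
print(response.choices[0].message.content)
```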
|
||||
|
|
|
@ -164,7 +164,7 @@ curl -i http://localhost:4000/v1/chat/completions \
|
|||
|
||||
**Expected response**
|
||||
|
||||
Your response headers will include `x-litellm-applied-guardrails` with the guardrail applied
|
||||
|
||||
```
|
||||
x-litellm-applied-guardrails: aporia-pre-guard
|
||||
|
|
279
docs/my-website/docs/proxy/litellm_managed_files.md
Normal file
|
@ -0,0 +1,279 @@
|
|||
import TabItem from '@theme/TabItem';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import Image from '@theme/IdealImage';
|
||||
|
||||
# [BETA] Unified File ID
|
||||
|
||||
Reuse the same 'file id' across different providers.
|
||||
|
||||
| Feature | Description | Comments |
|
||||
| --- | --- | --- |
|
||||
| Proxy | ✅ | |
|
||||
| SDK | ❌ | Requires postgres DB for storing file ids |
|
||||
| Available across all providers | ✅ | |
|
||||
|
||||
|
||||
|
||||
Limitations of LiteLLM Managed Files:
|
||||
- Only works for `/chat/completions` requests.
|
||||
- Assumes just 1 model configured per model_name.
|
||||
|
||||
Follow [here](https://github.com/BerriAI/litellm/discussions/9632) for multiple models, batches support.
|
||||
|
||||
### 1. Setup config.yaml
|
||||
|
||||
```
|
||||
model_list:
|
||||
- model_name: "gemini-2.0-flash"
|
||||
litellm_params:
|
||||
model: vertex_ai/gemini-2.0-flash
|
||||
vertex_project: my-project-id
|
||||
vertex_location: us-central1
|
||||
- model_name: "gpt-4o-mini-openai"
|
||||
litellm_params:
|
||||
model: gpt-4o-mini
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
### 2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
### 3. Test it!
|
||||
|
||||
Specify `target_model_names` to use the same file id across different providers. This is the list of model_names set via config.yaml (or 'public_model_names' on UI).
|
||||
|
||||
```python
|
||||
target_model_names="gpt-4o-mini-openai, gemini-2.0-flash" # 👈 Specify model_names
|
||||
```
|
||||
|
||||
Check `/v1/models` to see the list of available model names for a key.
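For example, with the OpenAI SDK pointed at the proxy (URL and key below are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

# Lists the model names this key can use on the proxy
for model in client.models.list():
    print(model.id)
```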
|
||||
|
||||
#### **Store a PDF file**
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
|
||||
|
||||
|
||||
# Download and save the PDF locally
|
||||
url = (
|
||||
"https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
|
||||
)
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
# Save the PDF locally
|
||||
with open("2403.05530.pdf", "wb") as f:
|
||||
f.write(response.content)
|
||||
|
||||
file = client.files.create(
|
||||
file=open("2403.05530.pdf", "rb"),
|
||||
purpose="user_data", # can be any openai 'purpose' value
|
||||
extra_body={"target_model_names": "gpt-4o-mini-openai, gemini-2.0-flash"}, # 👈 Specify model_names
|
||||
)
|
||||
|
||||
print(f"file id={file.id}")
|
||||
```
|
||||
|
||||
#### **Use the same file id across different providers**
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI">
|
||||
|
||||
```python
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-4o-mini-openai",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is in this recording?"},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file.id,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="vertex" label="Vertex AI">
|
||||
|
||||
```python
|
||||
completion = client.chat.completions.create(
|
||||
model="gemini-2.0-flash",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is in this recording?"},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file.id,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Complete Example
|
||||
|
||||
```python
|
||||
import base64
|
||||
import requests
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
|
||||
|
||||
|
||||
# Download and save the PDF locally
|
||||
url = (
|
||||
"https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
|
||||
)
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
# Save the PDF locally
|
||||
with open("2403.05530.pdf", "wb") as f:
|
||||
f.write(response.content)
|
||||
|
||||
# Read the local PDF file
|
||||
file = client.files.create(
|
||||
file=open("2403.05530.pdf", "rb"),
|
||||
purpose="user_data", # can be any openai 'purpose' value
|
||||
extra_body={"target_model_names": "gpt-4o-mini-openai, vertex_ai/gemini-2.0-flash"},
|
||||
)
|
||||
|
||||
print(f"file.id: {file.id}") # 👈 Unified file id
|
||||
|
||||
## GEMINI CALL ###
|
||||
completion = client.chat.completions.create(
|
||||
model="gemini-2.0-flash",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is in this recording?"},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file.id,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
|
||||
|
||||
### OPENAI CALL ###
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-4o-mini-openai",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is in this recording?"},
|
||||
{
|
||||
"type": "file",
|
||||
"file": {
|
||||
"file_id": file.id,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Supported Endpoints
|
||||
|
||||
#### Create a file - `/files`
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
|
||||
|
||||
# Download and save the PDF locally
|
||||
url = (
|
||||
"https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
|
||||
)
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
# Save the PDF locally
|
||||
with open("2403.05530.pdf", "wb") as f:
|
||||
f.write(response.content)
|
||||
|
||||
# Read the local PDF file
|
||||
file = client.files.create(
|
||||
file=open("2403.05530.pdf", "rb"),
|
||||
purpose="user_data", # can be any openai 'purpose' value
|
||||
extra_body={"target_model_names": "gpt-4o-mini-openai, vertex_ai/gemini-2.0-flash"},
|
||||
)
|
||||
```
|
||||
|
||||
#### Retrieve a file - `/files/{file_id}`
|
||||
|
||||
```python
|
||||
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
|
||||
|
||||
file = client.files.retrieve(file_id=file.id)
|
||||
```
|
||||
|
||||
#### Delete a file - `/files/{file_id}/delete`
|
||||
|
||||
```python
|
||||
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
|
||||
|
||||
file = client.files.delete(file_id=file.id)
|
||||
```
|
||||
|
||||
### FAQ
|
||||
|
||||
**1. Does LiteLLM store the file?**
|
||||
|
||||
No, LiteLLM does not store the file. It only stores the file ids in the postgres DB.
|
||||
|
||||
**2. How does LiteLLM know which file to use for a given file id?**
|
||||
|
||||
LiteLLM stores a mapping of the litellm file id to the model-specific file id in the postgres DB. When a request comes in, LiteLLM looks up the model-specific file id and uses it in the request to the provider.
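Conceptually, the lookup works like the sketch below. This is an illustration only; the mapping shape and the `resolve_file_id` helper are hypothetical, not LiteLLM's actual schema or code.

```python
# Hypothetical illustration of the unified file id mapping
UNIFIED_FILE_MAP = {
    "litellm-file-abc123": {                      # unified file id returned to the client
        "gpt-4o-mini-openai": "file-openai-xyz",  # OpenAI file id
        "gemini-2.0-flash": "files/gemini-456",   # Vertex AI / Gemini file reference
    }
}

def resolve_file_id(unified_id: str, model_name: str) -> str:
    """Return the provider-specific file id to forward to the target provider."""
    return UNIFIED_FILE_MAP[unified_id][model_name]
```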
|
||||
|
||||
**3. How do file deletions work?**
|
||||
|
||||
When a file is deleted, LiteLLM deletes the mapping from the postgres DB, and the files on each provider.
|
||||
|
||||
### Architecture
|
||||
|
||||
<Image img={require('../../img/managed_files_arch.png')} style={{ width: '800px', height: 'auto' }} />
|
|
@ -277,7 +277,7 @@ Found under `kwargs["standard_logging_object"]`. This is a standard payload, log
|
|||
|
||||
## Langfuse
|
||||
|
||||
We will use the `--config` to set `litellm.success_callback = ["langfuse"]`. This will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
|
||||
|
||||
**Step 1** Install langfuse
|
||||
|
||||
|
@ -535,8 +535,8 @@ print(response)
|
|||
Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields
|
||||
|
||||
| LiteLLM specific field | Description | Example Value |
|
||||
|---------------------------|-----------------------------------------------------------------------------------------|------------------------------------------------|
| `cache_hit` | Indicates whether a cache hit occurred (True) or not (False) | `true`, `false` |
|
||||
| `cache_key` | The Cache key used for this request | `d2b758c****` |
|
||||
| `proxy_base_url` | The base URL for the proxy server, the value of env var `PROXY_BASE_URL` on your server | `https://proxy.example.com` |
|
||||
| `user_api_key_alias` | An alias for the LiteLLM Virtual Key. | `prod-app1` |
|
||||
|
@ -862,7 +862,7 @@ Add the following to your env
|
|||
|
||||
```shell
|
||||
OTEL_EXPORTER="otlp_http"
|
||||
OTEL_ENDPOINT="http:/0.0.0.0:4317"
|
||||
OTEL_ENDPOINT="http://0.0.0.0:4317"
|
||||
OTEL_HEADERS="x-honeycomb-team=<your-api-key>" # Optional
|
||||
```
|
||||
|
||||
|
@ -1190,7 +1190,7 @@ We will use the `--config` to set
|
|||
|
||||
- `litellm.success_callback = ["s3"]`
|
||||
|
||||
This will log all successful LLM calls to your s3 bucket
|
||||
|
||||
**Step 1** Set AWS Credentials in .env
|
||||
|
||||
|
@ -1279,7 +1279,7 @@ Log LLM Logs to [Azure Data Lake Storage](https://learn.microsoft.com/en-us/azur
|
|||
|
||||
| Property | Details |
|
||||
|----------|---------|
|
||||
| Description | Log LLM Input/Output to Azure Blob Storage (Bucket) |
|
||||
| Azure Docs on Data Lake Storage | [Azure Data Lake Storage](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) |
|
||||
|
||||
|
||||
|
@ -1360,7 +1360,7 @@ LiteLLM Supports logging to the following Datdog Integrations:
|
|||
<Tabs>
|
||||
<TabItem value="datadog" label="Datadog Logs">
|
||||
|
||||
We will use the `--config` to set `litellm.callbacks = ["datadog"]`. This will log all successful LLM calls to DataDog
|
||||
|
||||
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
|
||||
|
||||
|
@ -1636,7 +1636,7 @@ class MyCustomHandler(CustomLogger):
|
|||
litellm_params = kwargs.get("litellm_params", {})
|
||||
metadata = litellm_params.get("metadata", {}) # headers passed to LiteLLM proxy, can be found here
|
||||
|
||||
# Access Exceptions & Traceback
|
||||
exception_event = kwargs.get("exception", None)
|
||||
traceback_event = kwargs.get("traceback_exception", None)
|
||||
|
||||
|
@ -2205,7 +2205,7 @@ We will use the `--config` to set
|
|||
- `litellm.success_callback = ["dynamodb"]`
|
||||
- `litellm.dynamodb_table_name = "your-table-name"`
|
||||
|
||||
This will log all successful LLM calls to DynamoDB
|
||||
|
||||
**Step 1** Set AWS Credentials in .env
|
||||
|
||||
|
@ -2370,7 +2370,7 @@ litellm --test
|
|||
|
||||
[Athina](https://athina.ai/) allows you to log LLM Input/Output for monitoring, analytics, and observability.
|
||||
|
||||
We will use the `--config` to set `litellm.success_callback = ["athina"]`. This will log all successful LLM calls to Athina
|
||||
|
||||
**Step 1** Set Athina API key
|
||||
|
||||
|
|
108
docs/my-website/docs/proxy/model_discovery.md
Normal file
|
@ -0,0 +1,108 @@
|
|||
# Model Discovery
|
||||
|
||||
Use this to give users an accurate list of the models available behind a provider endpoint when calling `/v1/models` for wildcard models.
|
||||
|
||||
## Supported Models
|
||||
|
||||
- Fireworks AI
|
||||
- OpenAI
|
||||
- Gemini
|
||||
- LiteLLM Proxy
|
||||
- Topaz
|
||||
- Anthropic
|
||||
- XAI
|
||||
- VLLM
|
||||
- Vertex AI
|
||||
|
||||
### Usage
|
||||
|
||||
**1. Setup config.yaml**
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: xai/*
|
||||
litellm_params:
|
||||
model: xai/*
|
||||
api_key: os.environ/XAI_API_KEY
|
||||
|
||||
litellm_settings:
|
||||
check_provider_endpoint: true # 👈 Enable checking provider endpoint for wildcard models
|
||||
```
|
||||
|
||||
**2. Start proxy**
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
**3. Call `/v1/models`**
|
||||
|
||||
```bash
|
||||
curl -X GET "http://localhost:4000/v1/models" -H "Authorization: Bearer $LITELLM_KEY"
|
||||
```
|
||||
|
||||
Expected response
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"id": "xai/grok-2-1212",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-2-vision-1212",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-3-beta",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-3-fast-beta",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-3-mini-beta",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-3-mini-fast-beta",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-beta",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-vision-beta",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
},
|
||||
{
|
||||
"id": "xai/grok-2-image-1212",
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai"
|
||||
}
|
||||
],
|
||||
"object": "list"
|
||||
}
|
||||
```
|
|
@ -61,7 +61,7 @@ CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
|
|||
|
||||
## 3. Use Redis 'port','host', 'password'. NOT 'redis_url'
|
||||
|
||||
If you decide to use Redis, DO NOT use 'redis_url'. We recommend using the redis port, host, and password params.
|
||||
|
||||
`redis_url` is 80 RPS slower
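Purely to illustrate the two styles (values are placeholders, and this uses the redis-py client rather than the proxy's own connection code):

```python
import redis

# Recommended style: explicit host / port / password params
r = redis.Redis(host="my-redis-host", port=6379, password="my-password")

# The slower pattern this section warns against: a single connection URL
# r = redis.from_url("redis://:my-password@my-redis-host:6379")
```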
|
||||
|
||||
|
@ -169,7 +169,7 @@ If you plan on using the DB, set a salt key for encrypting/decrypting variables
|
|||
|
||||
Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials
|
||||
|
||||
We recommend the https://1password.com/password-generator/ password generator to get a random hash for the litellm salt key.
|
||||
|
||||
```bash
|
||||
export LITELLM_SALT_KEY="sk-1234"
|
||||
|
@ -177,6 +177,50 @@ export LITELLM_SALT_KEY="sk-1234"
|
|||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15)
|
||||
|
||||
|
||||
## 9. Use `prisma migrate deploy`
|
||||
|
||||
Use this to handle db migrations across LiteLLM versions in production
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="env" label="ENV">
|
||||
|
||||
```bash
|
||||
USE_PRISMA_MIGRATE="True"
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="cli" label="CLI">
|
||||
|
||||
```bash
|
||||
litellm --use_prisma_migrate
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
Benefits:
|
||||
|
||||
The migrate deploy command:
|
||||
|
||||
- **Does not** issue a warning if an already applied migration is missing from migration history
|
||||
- **Does not** detect drift (production database schema differs from migration history end state - for example, due to a hotfix)
|
||||
- **Does not** reset the database or generate artifacts (such as Prisma Client)
|
||||
- **Does not** rely on a shadow database
|
||||
|
||||
|
||||
### How does LiteLLM handle DB migrations in production?
|
||||
|
||||
1. A new migration file is written to our `litellm-proxy-extras` package. [See all](https://github.com/BerriAI/litellm/tree/main/litellm-proxy-extras/litellm_proxy_extras/migrations)
|
||||
|
||||
2. The core litellm pip package is bumped to point to the new `litellm-proxy-extras` package. This ensures, older versions of LiteLLM will continue to use the old migrations. [See code](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/pyproject.toml#L58)
|
||||
|
||||
3. When you upgrade to a new version of LiteLLM, the migration file is applied to the database. [See code](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/litellm-proxy-extras/litellm_proxy_extras/utils.py#L42)
|
||||
|
||||
|
||||
|
||||
|
||||
## Extras
|
||||
### Expected Performance in Production
|
||||
|
||||
|
|
|
@ -95,7 +95,14 @@ Use this for for tracking per [user, key, team, etc.](virtual_keys)
|
|||
|
||||
### Initialize Budget Metrics on Startup
|
||||
|
||||
If you want litellm to emit budget metrics for all keys and teams, irrespective of whether they are receiving requests, set `prometheus_initialize_budget_metrics` to `true` in the `config.yaml`
|
||||
|
||||
**How this works:**
|
||||
|
||||
- If the `prometheus_initialize_budget_metrics` is set to `true`
|
||||
- Every 5 minutes litellm runs a cron job to read all keys, teams from the database
|
||||
- It then emits the budget metrics for each key, team
|
||||
- This is used to populate the budget metrics on the `/metrics` endpoint
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
|
|
|
@ -161,6 +161,83 @@ Here's the available UI roles for a LiteLLM Internal User:
|
|||
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
|
||||
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
|
||||
|
||||
## Auto-add SSO users to teams
|
||||
|
||||
This walks through setting up sso auto-add for **Okta, Google SSO**
|
||||
|
||||
### Okta, Google SSO
|
||||
|
||||
1. Specify the JWT field that contains the team ids the user belongs to.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
litellm_jwtauth:
|
||||
team_ids_jwt_field: "groups" # 👈 CAN BE ANY FIELD
|
||||
```
|
||||
|
||||
This is assuming your SSO token looks like this. **If you need to inspect the JWT fields received from your SSO provider by LiteLLM, follow these instructions [here](#debugging-sso-jwt-fields)**
|
||||
|
||||
```
|
||||
{
|
||||
...,
|
||||
"groups": ["team_id_1", "team_id_2"]
|
||||
}
|
||||
```
|
||||
|
||||
2. Create the teams on LiteLLM
|
||||
|
||||
```bash
|
||||
curl -X POST '<PROXY_BASE_URL>/team/new' \
|
||||
-H 'Authorization: Bearer <PROXY_MASTER_KEY>' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-D '{
|
||||
"team_alias": "team_1",
|
||||
"team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID
|
||||
}'
|
||||
```
|
||||
|
||||
3. Test the SSO flow
|
||||
|
||||
Here's a walkthrough of [how it works](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e)
|
||||
|
||||
### Microsoft Entra ID SSO group assignment
|
||||
|
||||
Follow this [tutorial for auto-adding sso users to teams with Microsoft Entra ID](https://docs.litellm.ai/docs/tutorials/msft_sso)
|
||||
|
||||
### Debugging SSO JWT fields
|
||||
|
||||
If you need to inspect the JWT fields received from your SSO provider by LiteLLM, follow these instructions. This guide walks you through setting up a debug callback to view the JWT data during the SSO process.
|
||||
|
||||
|
||||
<Image img={require('../../img/debug_sso.png')} style={{ width: '500px', height: 'auto' }} />
|
||||
<br />
|
||||
|
||||
1. Add `/sso/debug/callback` as a redirect URL in your SSO provider
|
||||
|
||||
In your SSO provider's settings, add the following URL as a new redirect (callback) URL:
|
||||
|
||||
```bash showLineNumbers title="Redirect URL"
|
||||
http://<proxy_base_url>/sso/debug/callback
|
||||
```
|
||||
|
||||
|
||||
2. Navigate to the debug login page on your browser
|
||||
|
||||
Navigate to the following URL on your browser:
|
||||
|
||||
```bash showLineNumbers title="URL to navigate to"
|
||||
https://<proxy_base_url>/sso/debug/login
|
||||
```
|
||||
|
||||
This will initiate the standard SSO flow. You will be redirected to your SSO provider's login screen, and after successful authentication, you will be redirected back to LiteLLM's debug callback route.
|
||||
|
||||
|
||||
3. View the JWT fields
|
||||
|
||||
Once redirected, you should see a page called "SSO Debug Information". This page displays the JWT fields received from your SSO provider (as shown in the image above)
|
||||
|
||||
|
||||
## Advanced
|
||||
### Setting custom logout URLs
|
||||
|
||||
|
@ -196,40 +273,26 @@ This budget does not apply to keys created under non-default teams.
|
|||
|
||||
[**Go Here**](./team_budgets.md)
|
||||
|
||||
### Auto-add SSO users to teams
|
||||
### Set default params for new teams
|
||||
|
||||
1. Specify the JWT field that contains the team ids, that the user belongs to.
|
||||
When you connect litellm to your SSO provider, litellm can auto-create teams. Use this to set the default `models`, `max_budget`, `budget_duration` for these auto-created teams.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
litellm_jwtauth:
|
||||
team_ids_jwt_field: "groups" # 👈 CAN BE ANY FIELD
|
||||
**How it works**
|
||||
|
||||
1. When litellm fetches `groups` from your SSO provider, it will check if the corresponding group_id exists as a `team_id` in litellm.
|
||||
2. If the team_id does not exist, litellm will auto-create a team with the default params you've set.
|
||||
3. If the team_id already exists, litellm will not apply any settings to the team.
|
||||
|
||||
**Usage**
|
||||
|
||||
```yaml showLineNumbers title="Default Params for new teams"
|
||||
litellm_settings:
|
||||
default_team_params: # Default Params to apply when litellm auto creates a team from SSO IDP provider
|
||||
max_budget: 100 # Optional[float], optional): $100 budget for the team
|
||||
budget_duration: 30d # Optional[str], optional): 30 days budget_duration for the team
|
||||
models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by the team
|
||||
```
|
||||
|
||||
This is assuming your SSO token looks like this:
|
||||
```
|
||||
{
|
||||
...,
|
||||
"groups": ["team_id_1", "team_id_2"]
|
||||
}
|
||||
```
|
||||
|
||||
2. Create the teams on LiteLLM
|
||||
|
||||
```bash
|
||||
curl -X POST '<PROXY_BASE_URL>/team/new' \
|
||||
-H 'Authorization: Bearer <PROXY_MASTER_KEY>' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-D '{
|
||||
"team_alias": "team_1",
|
||||
"team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID
|
||||
}'
|
||||
```
|
||||
|
||||
3. Test the SSO flow
|
||||
|
||||
Here's a walkthrough of [how it works](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e)
|
||||
|
||||
### Restrict Users from creating personal keys
|
||||
|
||||
|
@ -241,7 +304,7 @@ This will also prevent users from using their session tokens on the test keys ch
|
|||
|
||||
## **All Settings for Self Serve / SSO Flow**
|
||||
|
||||
```yaml
|
||||
```yaml showLineNumbers title="All Settings for Self Serve / SSO Flow"
|
||||
litellm_settings:
|
||||
max_internal_user_budget: 10 # max budget for internal users
|
||||
internal_user_budget_duration: "1mo" # reset every month
|
||||
|
@ -252,6 +315,11 @@ litellm_settings:
|
|||
budget_duration: 30d # Optional[str], optional): 30 days budget_duration for a new SSO sign in user
|
||||
models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by a new SSO sign in user
|
||||
|
||||
default_team_params: # Default Params to apply when litellm auto creates a team from SSO IDP provider
|
||||
max_budget: 100 # Optional[float], optional): $100 budget for the team
|
||||
budget_duration: 30d # Optional[str], optional): 30 days budget_duration for the team
|
||||
models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by the team
|
||||
|
||||
|
||||
upperbound_key_generate_params: # Upperbound for /key/generate requests when self-serve flow is on
|
||||
max_budget: 100 # Optional[float], optional): upperbound of $100, for all /key/generate requests
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
Set temporary budget increase for a LiteLLM Virtual Key. Use this if you get asked to increase the budget for a key temporarily.
|
||||
|
||||
|
||||
| Hierarchy | Supported |
|
||||
|-----------|-----------|
|
||||
| LiteLLM Virtual Key | ✅ |
|
||||
| User | ❌ |
|
||||
|
|
|
@ -4,7 +4,7 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
# Adding LLM Credentials
|
||||
|
||||
You can add LLM provider credentials on the UI. Once you add credentials you can reuse them when adding new models
|
||||
|
||||
## Add a credential + model
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ Requirements:
|
|||
- **Set on config.yaml** set your master key under `general_settings:master_key`, example below
- **Set env variable** set `LITELLM_MASTER_KEY`
|
||||
|
||||
(the proxy Dockerfile checks if the `DATABASE_URL` is set and then initializes the DB connection)
|
||||
|
||||
```shell
|
||||
export DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>
|
||||
|
@ -333,7 +333,7 @@ curl http://localhost:4000/v1/chat/completions \
|
|||
|
||||
**Expected Response**
|
||||
|
||||
Expect to see a successful response from the litellm proxy since the key passed in `X-Litellm-Key` is valid
|
||||
```shell
|
||||
{"id":"chatcmpl-f9b2b79a7c30477ab93cd0e717d1773e","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}
|
||||
```
|
||||
|
|
|
@ -15,14 +15,17 @@ Supported Providers:
|
|||
- Bedrock (Anthropic + Deepseek) (`bedrock/`)
|
||||
- Vertex AI (Anthropic) (`vertexai/`)
|
||||
- OpenRouter (`openrouter/`)
|
||||
- XAI (`xai/`)
|
||||
- Google AI Studio (`google/`)
|
||||
- Vertex AI (`vertex_ai/`)
|
||||
|
||||
LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.
|
||||
|
||||
```python
|
||||
```python title="Example response from litellm"
|
||||
"message": {
|
||||
...
|
||||
"reasoning_content": "The capital of France is Paris.",
|
||||
"thinking_blocks": [
|
||||
"thinking_blocks": [ # only returned for Anthropic models
|
||||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The capital of France is Paris.",
|
||||
|
@ -37,7 +40,7 @@ LiteLLM will standardize the `reasoning_content` in the response and `thinking_b
|
|||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
```python showLineNumbers
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
|
@ -111,7 +114,7 @@ Here's how to use `thinking` blocks by Anthropic with tool calling.
|
|||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
```python showLineNumbers
|
||||
litellm._turn_on_debug()
|
||||
litellm.modify_params = True
|
||||
model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI
|
||||
|
@ -210,7 +213,7 @@ if tool_calls:
|
|||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
```yaml showLineNumbers
|
||||
model_list:
|
||||
- model_name: claude-3-7-sonnet-thinking
|
||||
litellm_params:
|
||||
|
@ -224,7 +227,7 @@ model_list:
|
|||
|
||||
2. Run proxy
|
||||
|
||||
```bash
|
||||
```bash showLineNumbers
|
||||
litellm --config config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
|
@ -332,7 +335,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
|||
|
||||
Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927).
|
||||
|
||||
```python
|
||||
```python showLineNumbers
|
||||
litellm.drop_params = True # 👈 EITHER GLOBALLY or per request
|
||||
|
||||
# or per request
|
||||
|
@ -373,7 +376,7 @@ You can also pass the `thinking` parameter to Anthropic models.
|
|||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
```python showLineNumbers
|
||||
response = litellm.completion(
|
||||
model="anthropic/claude-3-7-sonnet-20250219",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
|
@ -395,5 +398,92 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
|||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Checking if a model supports reasoning
|
||||
|
||||
<Tabs>
|
||||
<TabItem label="LiteLLM Python SDK" value="Python">
|
||||
|
||||
Use `litellm.supports_reasoning(model="")` -> returns `True` if model supports reasoning and `False` if not.
|
||||
|
||||
```python showLineNumbers title="litellm.supports_reasoning() usage"
|
||||
import litellm
|
||||
|
||||
# Example models that support reasoning
|
||||
assert litellm.supports_reasoning(model="anthropic/claude-3-7-sonnet-20250219") == True
|
||||
assert litellm.supports_reasoning(model="deepseek/deepseek-chat") == True
|
||||
|
||||
# Example models that do not support reasoning
|
||||
assert litellm.supports_reasoning(model="openai/gpt-3.5-turbo") == False
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem label="LiteLLM Proxy Server" value="proxy">
|
||||
|
||||
1. Define models that support reasoning in your `config.yaml`. You can optionally add `supports_reasoning: True` to the `model_info` if LiteLLM does not automatically detect it for your custom model.
|
||||
|
||||
```yaml showLineNumbers title="litellm proxy config.yaml"
|
||||
model_list:
|
||||
- model_name: claude-3-sonnet-reasoning
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-7-sonnet-20250219
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
- model_name: deepseek-reasoning
|
||||
litellm_params:
|
||||
model: deepseek/deepseek-chat
|
||||
api_key: os.environ/DEEPSEEK_API_KEY
|
||||
# Example for a custom model where detection might be needed
|
||||
- model_name: my-custom-reasoning-model
|
||||
litellm_params:
|
||||
model: openai/my-custom-model # Assuming it's OpenAI compatible
|
||||
api_base: http://localhost:8000
|
||||
api_key: fake-key
|
||||
model_info:
|
||||
supports_reasoning: True # Explicitly mark as supporting reasoning
|
||||
```
|
||||
|
||||
2. Run the proxy server:
|
||||
|
||||
```bash showLineNumbers title="litellm --config config.yaml"
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Call `/model_group/info` to check if your model supports `reasoning`
|
||||
|
||||
```shell showLineNumbers title="curl /model_group/info"
|
||||
curl -X 'GET' \
|
||||
'http://localhost:4000/model_group/info' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'x-api-key: sk-1234'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```json showLineNumbers title="response from /model_group/info"
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"model_group": "claude-3-sonnet-reasoning",
|
||||
"providers": ["anthropic"],
|
||||
"mode": "chat",
|
||||
"supports_reasoning": true,
|
||||
},
|
||||
{
|
||||
"model_group": "deepseek-reasoning",
|
||||
"providers": ["deepseek"],
|
||||
"supports_reasoning": true,
|
||||
},
|
||||
{
|
||||
"model_group": "my-custom-reasoning-model",
|
||||
"providers": ["openai"],
|
||||
"supports_reasoning": true,
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
|
|
@ -14,22 +14,22 @@ LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](http
|
|||
| Fallbacks | ✅ | Works between supported models |
|
||||
| Loadbalancing | ✅ | Works between supported models |
|
||||
| Supported LiteLLM Versions | 1.63.8+ | |
|
||||
| Supported LLM providers | **All LiteLLM supported providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. |
|
||||
|
||||
## Usage
|
||||
|
||||
## Create a model response
|
||||
### LiteLLM Python SDK
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="litellm-sdk" label="LiteLLM SDK">
|
||||
<TabItem value="openai" label="OpenAI">
|
||||
|
||||
#### Non-streaming
|
||||
```python
|
||||
```python showLineNumbers title="OpenAI Non-streaming Response"
|
||||
import litellm
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="o1-pro",
|
||||
model="openai/o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
@ -38,12 +38,12 @@ print(response)
|
|||
```
|
||||
|
||||
#### Streaming
|
||||
```python
|
||||
```python showLineNumbers title="OpenAI Streaming Response"
|
||||
import litellm
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="o1-pro",
|
||||
model="openai/o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
@ -53,58 +53,169 @@ for event in response:
|
|||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: o1-pro
|
||||
litellm_params:
|
||||
model: openai/o1-pro
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
Start your LiteLLM proxy:
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
Then use the OpenAI SDK pointed to your proxy:
|
||||
<TabItem value="anthropic" label="Anthropic">
|
||||
|
||||
#### Non-streaming
|
||||
```python
|
||||
from openai import OpenAI
|
||||
```python showLineNumbers title="Anthropic Non-streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
# Set API key
|
||||
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
response = litellm.responses(
|
||||
model="anthropic/claude-3-5-sonnet-20240620",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python
|
||||
from openai import OpenAI
|
||||
```python showLineNumbers title="Anthropic Streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
# Set API key
|
||||
os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="o1-pro",
|
||||
response = litellm.responses(
|
||||
model="anthropic/claude-3-5-sonnet-20240620",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="vertex" label="Vertex AI">
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="Vertex AI Non-streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Set credentials - Vertex AI uses application default credentials
|
||||
# Run 'gcloud auth application-default login' to authenticate
|
||||
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
|
||||
os.environ["VERTEXAI_LOCATION"] = "us-central1"
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="vertex_ai/gemini-1.5-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="Vertex AI Streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Set credentials - Vertex AI uses application default credentials
|
||||
# Run 'gcloud auth application-default login' to authenticate
|
||||
os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
|
||||
os.environ["VERTEXAI_LOCATION"] = "us-central1"
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="vertex_ai/gemini-1.5-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="bedrock" label="AWS Bedrock">
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="AWS Bedrock Non-streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Set AWS credentials
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
|
||||
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="AWS Bedrock Streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Set AWS credentials
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
|
||||
os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="gemini" label="Google AI Studio">
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="Google AI Studio Non-streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Set API key for Google AI Studio
|
||||
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="Google AI Studio Streaming Response"
|
||||
import litellm
|
||||
import os
|
||||
|
||||
# Set API key for Google AI Studio
|
||||
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
@ -115,3 +226,408 @@ for event in response:
|
|||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### LiteLLM Proxy with OpenAI SDK
|
||||
|
||||
First, set up and start your LiteLLM proxy server.
|
||||
|
||||
```bash title="Start LiteLLM Proxy Server"
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml showLineNumbers title="OpenAI Proxy Configuration"
|
||||
model_list:
|
||||
- model_name: openai/o1-pro
|
||||
litellm_params:
|
||||
model: openai/o1-pro
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="OpenAI Proxy Non-streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="openai/o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="OpenAI Proxy Streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="openai/o1-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="anthropic" label="Anthropic">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml showLineNumbers title="Anthropic Proxy Configuration"
|
||||
model_list:
|
||||
- model_name: anthropic/claude-3-5-sonnet-20240620
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-5-sonnet-20240620
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
```
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="Anthropic Proxy Non-streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="anthropic/claude-3-5-sonnet-20240620",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="Anthropic Proxy Streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="anthropic/claude-3-5-sonnet-20240620",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="vertex" label="Vertex AI">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml showLineNumbers title="Vertex AI Proxy Configuration"
|
||||
model_list:
|
||||
- model_name: vertex_ai/gemini-1.5-pro
|
||||
litellm_params:
|
||||
model: vertex_ai/gemini-1.5-pro
|
||||
vertex_project: your-gcp-project-id
|
||||
vertex_location: us-central1
|
||||
```
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="Vertex AI Proxy Non-streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="vertex_ai/gemini-1.5-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="Vertex AI Proxy Streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="vertex_ai/gemini-1.5-pro",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="bedrock" label="AWS Bedrock">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml showLineNumbers title="AWS Bedrock Proxy Configuration"
|
||||
model_list:
|
||||
- model_name: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
|
||||
litellm_params:
|
||||
model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
|
||||
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
|
||||
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
|
||||
aws_region_name: us-west-2
|
||||
```
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="AWS Bedrock Proxy Non-streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="AWS Bedrock Proxy Streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="gemini" label="Google AI Studio">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml showLineNumbers title="Google AI Studio Proxy Configuration"
|
||||
model_list:
|
||||
- model_name: gemini/gemini-1.5-flash
|
||||
litellm_params:
|
||||
model: gemini/gemini-1.5-flash
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
```
|
||||
|
||||
#### Non-streaming
|
||||
```python showLineNumbers title="Google AI Studio Proxy Non-streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python showLineNumbers title="Google AI Studio Proxy Streaming Response"
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="gemini/gemini-1.5-flash",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Supported Responses API Parameters
|
||||
|
||||
| Provider | Supported Parameters |
|
||||
|----------|---------------------|
|
||||
| `openai` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
|
||||
| `azure` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
|
||||
| `anthropic` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
|
||||
| `bedrock` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
|
||||
| `gemini` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
|
||||
| `vertex_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
|
||||
| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
|
||||
| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
|
||||
|
||||
## Load Balancing with Routing Affinity
|
||||
|
||||
When using the Responses API with multiple deployments of the same model (e.g., multiple Azure OpenAI endpoints), LiteLLM provides routing affinity for conversations. This ensures that follow-up requests using a `previous_response_id` are routed to the same deployment that generated the original response.
|
||||
|
||||
|
||||
#### Example Usage
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="python-sdk" label="Python SDK">
|
||||
|
||||
```python showLineNumbers title="Python SDK with Routing Affinity"
|
||||
import litellm
|
||||
|
||||
# Set up router with multiple deployments of the same model
|
||||
router = litellm.Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "azure-gpt4-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-4-turbo",
|
||||
"api_key": "your-api-key-1",
|
||||
"api_version": "2024-06-01",
|
||||
"api_base": "https://endpoint1.openai.azure.com",
|
||||
},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-gpt4-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-4-turbo",
|
||||
"api_key": "your-api-key-2",
|
||||
"api_version": "2024-06-01",
|
||||
"api_base": "https://endpoint2.openai.azure.com",
|
||||
},
|
||||
},
|
||||
],
|
||||
optional_pre_call_checks=["responses_api_deployment_check"],
|
||||
)
|
||||
|
||||
# Initial request
|
||||
response = await router.aresponses(
|
||||
model="azure-gpt4-turbo",
|
||||
input="Hello, who are you?",
|
||||
truncation="auto",
|
||||
)
|
||||
|
||||
# Store the response ID
|
||||
response_id = response.id
|
||||
|
||||
# Follow-up request - will be automatically routed to the same deployment
|
||||
follow_up = await router.aresponses(
|
||||
model="azure-gpt4-turbo",
|
||||
input="Tell me more about yourself",
|
||||
truncation="auto",
|
||||
previous_response_id=response_id # This ensures routing to the same deployment
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy-server" label="Proxy Server">
|
||||
|
||||
#### 1. Setup routing affinity on proxy config.yaml
|
||||
|
||||
To enable routing affinity for Responses API in your LiteLLM proxy, set `optional_pre_call_checks: ["responses_api_deployment_check"]` in your proxy config.yaml.
|
||||
|
||||
```yaml showLineNumbers title="config.yaml with Responses API Routing Affinity"
|
||||
model_list:
|
||||
- model_name: azure-gpt4-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-4-turbo
|
||||
api_key: your-api-key-1
|
||||
api_version: 2024-06-01
|
||||
api_base: https://endpoint1.openai.azure.com
|
||||
- model_name: azure-gpt4-turbo
|
||||
litellm_params:
|
||||
model: azure/gpt-4-turbo
|
||||
api_key: your-api-key-2
|
||||
api_version: 2024-06-01
|
||||
api_base: https://endpoint2.openai.azure.com
|
||||
|
||||
router_settings:
|
||||
optional_pre_call_checks: ["responses_api_deployment_check"]
|
||||
```
|
||||
|
||||
#### 2. Use the OpenAI Python SDK to make requests to LiteLLM Proxy
|
||||
|
||||
```python showLineNumbers title="OpenAI Client with Proxy Server"
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000",
|
||||
api_key="your-api-key"
|
||||
)
|
||||
|
||||
# Initial request
|
||||
response = client.responses.create(
|
||||
model="azure-gpt4-turbo",
|
||||
input="Hello, who are you?"
|
||||
)
|
||||
|
||||
response_id = response.id
|
||||
|
||||
# Follow-up request - will be automatically routed to the same deployment
|
||||
follow_up = client.responses.create(
|
||||
model="azure-gpt4-turbo",
|
||||
input="Tell me more about yourself",
|
||||
previous_response_id=response_id # This ensures routing to the same deployment
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
|
|
@ -994,16 +994,16 @@ litellm --health
|
|||
|
||||
## Logging Proxy Input/Output - OpenTelemetry
|
||||
|
||||
### Step 1 Start OpenTelemetry Collecter Docker Container
|
||||
### Step 1 Start OpenTelemetry Collector Docker Container
|
||||
This container sends logs to your selected destination
|
||||
|
||||
#### Install OpenTelemetry Collecter Docker Image
|
||||
#### Install OpenTelemetry Collector Docker Image
|
||||
```shell
|
||||
docker pull otel/opentelemetry-collector:0.90.0
|
||||
docker run -p 127.0.0.1:4317:4317 -p 127.0.0.1:55679:55679 otel/opentelemetry-collector:0.90.0
|
||||
```
|
||||
|
||||
#### Set Destination paths on OpenTelemetry Collecter
|
||||
#### Set Destination paths on OpenTelemetry Collector
|
||||
|
||||
Here's the OpenTelemetry yaml config to use with Elastic Search
|
||||
```yaml
|
||||
|
@ -1077,7 +1077,7 @@ general_settings:
|
|||
LiteLLM will read the `OTEL_ENDPOINT` environment variable to send data to your OTEL collector
|
||||
|
||||
```python
|
||||
os.environ['OTEL_ENDPOINT'] # defauls to 127.0.0.1:4317 if not provided
|
||||
os.environ['OTEL_ENDPOINT'] # defaults to 127.0.0.1:4317 if not provided
|
||||
```
|
||||
|
||||
#### Start LiteLLM Proxy
|
||||
|
@ -1101,8 +1101,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
```
|
||||
|
||||
|
||||
#### Test & View Logs on OpenTelemetry Collecter
|
||||
On successfull logging you should be able to see this log on your `OpenTelemetry Collecter` Docker Container
|
||||
#### Test & View Logs on OpenTelemetry Collector
|
||||
On successful logging you should be able to see this log on your `OpenTelemetry Collector` Docker Container
|
||||
```shell
|
||||
Events:
|
||||
SpanEvent #0
|
||||
|
@ -1149,7 +1149,7 @@ Here's the log view on Elastic Search. You can see the request `input`, `output`
|
|||
<Image img={require('../img/elastic_otel.png')} />
|
||||
|
||||
## Logging Proxy Input/Output - Langfuse
|
||||
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse
|
||||
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse
|
||||
|
||||
**Step 1** Install langfuse
|
||||
|
||||
|
|
|
@ -117,7 +117,7 @@ response = completion("command-nightly", messages)
|
|||
"""
|
||||
|
||||
|
||||
# qustions/logs you want to run the LLM on
|
||||
# questions/logs you want to run the LLM on
|
||||
questions = [
|
||||
"what is litellm?",
|
||||
"why should I use LiteLLM",
|
||||
|
|
|
@ -30,7 +30,7 @@ def inference(message, history):
|
|||
yield partial_message
|
||||
except Exception as e:
|
||||
print("Exception encountered:", str(e))
|
||||
yield f"An Error occured please 'Clear' the error and try your question again"
|
||||
yield f"An Error occurred please 'Clear' the error and try your question again"
|
||||
```
|
||||
|
||||
### Define Chat Interface
|
||||
|
|
162
docs/my-website/docs/tutorials/msft_sso.md
Normal file
|
@ -0,0 +1,162 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Microsoft SSO: Sync Groups, Members with LiteLLM
|
||||
|
||||
Sync Microsoft SSO Groups, Members with LiteLLM Teams.
|
||||
|
||||
<Image img={require('../../img/litellm_entra_id.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
<br />
|
||||
<br />
|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- An Azure Entra ID account with administrative access
|
||||
- A LiteLLM Enterprise App set up in your Azure Portal
|
||||
- Access to Microsoft Entra ID (Azure AD)
|
||||
|
||||
|
||||
## Overview of this tutorial
|
||||
|
||||
1. Auto-Create Entra ID Groups on LiteLLM Teams
|
||||
2. Sync Entra ID Team Memberships
|
||||
3. Set default params for new teams and users auto-created on LiteLLM
|
||||
|
||||
## 1. Auto-Create Entra ID Groups on LiteLLM Teams
|
||||
|
||||
In this step, our goal is to have LiteLLM automatically create a new team in the LiteLLM DB whenever a new group is added to the LiteLLM Enterprise App in Azure Entra ID.
|
||||
|
||||
### 1.1 Create a new group in Entra ID
|
||||
|
||||
|
||||
Navigate to [your Azure Portal](https://portal.azure.com/) > Groups > New Group. Create a new group.
|
||||
|
||||
<Image img={require('../../img/entra_create_team.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
### 1.2 Assign the group to your LiteLLM Enterprise App
|
||||
|
||||
On the Azure Portal, navigate to `Enterprise Applications` and select your litellm app.
|
||||
|
||||
<Image img={require('../../img/msft_enterprise_app.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
<br />
|
||||
<br />
|
||||
|
||||
Once you've selected your litellm app, click on `Users and Groups` > `Add user/group`
|
||||
|
||||
<Image img={require('../../img/msft_enterprise_assign_group.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
<br />
|
||||
|
||||
Now select the group you created in step 1.1 and add it to the LiteLLM Enterprise App. At this point, we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` team in the LiteLLM DB when a new user signs in.
|
||||
|
||||
<Image img={require('../../img/msft_enterprise_select_group.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
### 1.3 Sign in to LiteLLM UI via SSO
|
||||
|
||||
Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
|
||||
|
||||
<Image img={require('../../img/msft_sso_sign_in.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
### 1.4 Check the new team on LiteLLM UI
|
||||
|
||||
On the LiteLLM UI, navigate to `Teams`. You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
|
||||
|
||||
<Image img={require('../../img/msft_auto_team.png')} style={{ width: '900px', height: 'auto' }} />
|
||||
|
||||
#### How this works
|
||||
|
||||
When an SSO user signs in to LiteLLM, the following happens (see the sketch after this list):
|
||||
- LiteLLM automatically fetches the Groups under the LiteLLM Enterprise App
|
||||
- It finds the Production LLM Evals Group assigned to the LiteLLM Enterprise App
|
||||
- LiteLLM checks if this group's ID exists in the LiteLLM Teams Table
|
||||
- Since the ID doesn't exist, LiteLLM automatically creates a new team with:
|
||||
- Name: Production LLM Evals Group
|
||||
- ID: Same as the Entra ID group's ID
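
The same flow, written as a conceptual sketch. This is only an illustration of the steps above, not LiteLLM's internal code; the function and field names are hypothetical assumptions:

```python showLineNumbers title="Conceptual sketch: Entra ID group -> LiteLLM team"
# Hypothetical illustration of the auto-create flow described above.
# It is NOT LiteLLM's implementation; names and data shapes are assumptions.

def sync_groups_to_teams(assigned_groups: list[dict], existing_team_ids: set[str]) -> list[dict]:
    """Create a team entry for every Enterprise App group that has no matching team yet."""
    new_teams = []
    for group in assigned_groups:
        if group["id"] not in existing_team_ids:
            new_teams.append({
                "team_id": group["id"],              # ID: same as the Entra ID group's ID
                "team_alias": group["displayName"],  # Name: same as the Entra ID group's name
            })
    return new_teams


# Example: one assigned group, no existing teams -> one new team is created
print(sync_groups_to_teams(
    assigned_groups=[{"id": "entra-group-id-123", "displayName": "Production LLM Evals Group"}],
    existing_team_ids=set(),
))
```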
|
||||
|
||||
## 2. Sync Entra ID Team Memberships
|
||||
|
||||
In this step, we will have LiteLLM automatically add a user to the `Production LLM Evals` Team on the LiteLLM DB when a new user is added to the `Production LLM Evals` Group in Entra ID.
|
||||
|
||||
### 2.1 Navigate to the `Production LLM Evals` Group in Entra ID
|
||||
|
||||
Navigate to the `Production LLM Evals` Group in Entra ID.
|
||||
|
||||
<Image img={require('../../img/msft_member_1.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
### 2.2 Add a member to the group in Entra ID
|
||||
|
||||
Select `Members` > `Add members`
|
||||
|
||||
At this stage, add the user you want to include in the `Production LLM Evals` team.
|
||||
|
||||
<Image img={require('../../img/msft_member_2.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
|
||||
### 2.3 Sign in as the new user on LiteLLM UI
|
||||
|
||||
Sign in as the new user on the LiteLLM UI. You should be redirected to the Entra ID SSO page. This SSO sign-in flow triggers LiteLLM to fetch the latest groups and members from Azure Entra ID. During this step, LiteLLM syncs its teams and team members with what is available in Entra ID.
|
||||
|
||||
<Image img={require('../../img/msft_sso_sign_in.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
|
||||
### 2.4 Check the team membership on LiteLLM UI
|
||||
|
||||
On the LiteLLM UI, navigate to `Teams`. Since you are now a member of the `Production LLM Evals Group` in Entra ID, you should see yourself as a member of the `Production LLM Evals Group` team on the LiteLLM UI.
|
||||
|
||||
<Image img={require('../../img/msft_member_3.png')} style={{ width: '900px', height: 'auto' }} />
|
||||
|
||||
## 3. Set default params for new teams auto-created on LiteLLM
|
||||
|
||||
Since litellm auto-creates a new team in the LiteLLM DB whenever a new group is added to the LiteLLM Enterprise App in Azure Entra ID, you can set default params for these newly created teams.
|
||||
|
||||
This allows you to set a default budget, allowed models, etc. for newly created teams.
|
||||
|
||||
### 3.1 Set `default_team_params` on litellm
|
||||
|
||||
Navigate to your litellm config file and set the following params
|
||||
|
||||
```yaml showLineNumbers title="litellm config with default_team_params"
|
||||
litellm_settings:
|
||||
default_team_params: # Default Params to apply when litellm auto creates a team from SSO IDP provider
|
||||
max_budget: 100 # (Optional[float]) $100 budget for the team
|
||||
budget_duration: 30d # (Optional[str]) 30-day budget_duration for the team
|
||||
models: ["gpt-3.5-turbo"] # (Optional[List[str]]) models the team is allowed to use
|
||||
```
|
||||
|
||||
### 3.2 Auto-create a new team on LiteLLM
|
||||
|
||||
- In this step you should add a new group to the LiteLLM Enterprise App on Azure Entra ID (like we did in step 1.1). We will call this group `Default LiteLLM Prod Team` on Azure Entra ID.
|
||||
- Start litellm proxy server with your config
|
||||
- Sign into LiteLLM UI via SSO
|
||||
- Navigate to `Teams` and you should see the new team `Default LiteLLM Prod Team` auto-created on LiteLLM
|
||||
- Note: LiteLLM will apply the default params to this new team.
|
||||
|
||||
<Image img={require('../../img/msft_default_settings.png')} style={{ width: '900px', height: 'auto' }} />
|
||||
|
||||
|
||||
## Video Walkthrough
|
||||
|
||||
This walks through setting up SSO auto-add for **Microsoft Entra ID**.
|
||||
|
||||
Follow along with this video for a walkthrough of how to set this up with Microsoft Entra ID.
|
||||
|
||||
<iframe width="840" height="500" src="https://www.loom.com/embed/ea711323aa9a496d84a01fd7b2a12f54?sid=c53e238c-5bfd-4135-b8fb-b5b1a08632cf" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
146
docs/my-website/docs/tutorials/openai_codex.md
Normal file
|
@ -0,0 +1,146 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Using LiteLLM with OpenAI Codex
|
||||
|
||||
This guide walks you through connecting OpenAI Codex to LiteLLM. Using LiteLLM with Codex allows teams to:
|
||||
- Access 100+ LLMs through the Codex interface
|
||||
- Use powerful models like Gemini through a familiar interface
|
||||
- Track spend and usage with LiteLLM's built-in analytics
|
||||
- Control model access with virtual keys
|
||||
|
||||
<Image img={require('../../img/litellm_codex.gif')} />
|
||||
|
||||
## Quickstart
|
||||
|
||||
:::info
|
||||
|
||||
Requires LiteLLM v1.66.3.dev5 and higher
|
||||
|
||||
:::
|
||||
|
||||
|
||||
Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](../proxy/docker_quick_start.md).
|
||||
|
||||
## 1. Install OpenAI Codex
|
||||
|
||||
Install the OpenAI Codex CLI tool globally using npm or yarn:
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="npm" label="npm">
|
||||
|
||||
```bash showLineNumbers
|
||||
npm i -g @openai/codex
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="yarn" label="yarn">
|
||||
|
||||
```bash showLineNumbers
|
||||
yarn global add @openai/codex
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## 2. Start LiteLLM Proxy
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="docker" label="Docker">
|
||||
|
||||
```bash showLineNumbers
|
||||
docker run \
|
||||
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
|
||||
-p 4000:4000 \
|
||||
ghcr.io/berriai/litellm:main-latest \
|
||||
--config /app/config.yaml
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="pip" label="LiteLLM CLI">
|
||||
|
||||
```bash showLineNumbers
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
LiteLLM should now be running on [http://localhost:4000](http://localhost:4000)
|
||||
|
||||
## 3. Configure LiteLLM for Model Routing
|
||||
|
||||
Ensure your LiteLLM Proxy is properly configured to route to your desired models. Create a `litellm_config.yaml` file with the following content:
|
||||
|
||||
```yaml showLineNumbers
|
||||
model_list:
|
||||
- model_name: o3-mini
|
||||
litellm_params:
|
||||
model: openai/o3-mini
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
- model_name: claude-3-7-sonnet-latest
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-7-sonnet-latest
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
- model_name: gemini-2.0-flash
|
||||
litellm_params:
|
||||
model: gemini/gemini-2.0-flash
|
||||
api_key: os.environ/GEMINI_API_KEY
|
||||
|
||||
litellm_settings:
|
||||
drop_params: true
|
||||
```
|
||||
|
||||
This configuration enables routing to specific OpenAI, Anthropic, and Gemini models with explicit names.
|
||||
|
||||
## 4. Configure Codex to Use LiteLLM Proxy
|
||||
|
||||
Set the required environment variables to point Codex to your LiteLLM Proxy:
|
||||
|
||||
```bash
|
||||
# Point to your LiteLLM Proxy server
|
||||
export OPENAI_BASE_URL=http://0.0.0.0:4000
|
||||
|
||||
# Use your LiteLLM API key (if you've set up authentication)
|
||||
export OPENAI_API_KEY="sk-1234"
|
||||
```
|
||||
|
||||
## 5. Run Codex with Gemini
|
||||
|
||||
With everything configured, you can now run Codex with Gemini:
|
||||
|
||||
```bash showLineNumbers
|
||||
codex --model gemini-2.0-flash --full-auto
|
||||
```
|
||||
|
||||
<Image img={require('../../img/litellm_codex.gif')} />
|
||||
|
||||
The `--full-auto` flag allows Codex to automatically generate code without additional prompting.
|
||||
|
||||
## 6. Advanced Options
|
||||
|
||||
### Using Different Models
|
||||
|
||||
You can use any model configured in your LiteLLM proxy:
|
||||
|
||||
```bash
|
||||
# Use Claude models
|
||||
codex --model claude-3-7-sonnet-latest
|
||||
|
||||
# Use Google AI Studio Gemini models
|
||||
codex --model gemini-2.0-flash
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
- If you encounter connection issues, ensure your LiteLLM Proxy is running and accessible at the specified URL (see the quick check below)
|
||||
- Verify your LiteLLM API key is valid if you're using authentication
|
||||
- Check that your model routing configuration is correct
|
||||
- For model-specific errors, ensure the model is properly configured in your LiteLLM setup
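
For the connection check, here is a minimal sketch using the OpenAI SDK against the proxy. It assumes the proxy from step 2 is running on `http://localhost:4000` and that `sk-1234` is a valid key from your setup:

```python showLineNumbers title="Quick connectivity check against the LiteLLM Proxy"
from openai import OpenAI

# Assumptions: proxy running locally on port 4000, "sk-1234" is a valid proxy key.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

# List the models the proxy exposes; the names from litellm_config.yaml
# (o3-mini, claude-3-7-sonnet-latest, gemini-2.0-flash) should appear here.
for model in client.models.list():
    print(model.id)
```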
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [LiteLLM Docker Quick Start Guide](../proxy/docker_quick_start.md)
|
||||
- [OpenAI Codex GitHub Repository](https://github.com/openai/codex)
|
||||
- [LiteLLM Virtual Keys and Authentication](../proxy/virtual_keys.md)
|
128
docs/my-website/docs/tutorials/prompt_caching.md
Normal file
|
@ -0,0 +1,128 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Auto-Inject Prompt Caching Checkpoints
|
||||
|
||||
Reduce costs by up to 90% by using LiteLLM to auto-inject prompt caching checkpoints.
|
||||
|
||||
<Image img={require('../../img/auto_prompt_caching.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
## How it works
|
||||
|
||||
LiteLLM can automatically inject prompt caching checkpoints into your requests to LLM providers. This allows:
|
||||
|
||||
- **Cost Reduction**: Long, static parts of your prompts can be cached to avoid repeated processing
|
||||
- **No need to modify your application code**: You can configure the auto-caching behavior in the LiteLLM UI or in the `litellm config.yaml` file.
|
||||
|
||||
## Configuration
|
||||
|
||||
You need to specify `cache_control_injection_points` in your model configuration. This tells LiteLLM:
|
||||
1. Where to add the caching directive (`location`)
|
||||
2. Which message to target (`role`)
|
||||
|
||||
LiteLLM will then automatically add a `cache_control` directive to the specified messages in your requests:
|
||||
|
||||
```json
|
||||
"cache_control": {
|
||||
"type": "ephemeral"
|
||||
}
|
||||
```
|
||||
|
||||
## Usage Example
|
||||
|
||||
In this example, we'll configure caching for system messages by adding the directive to all messages with `role: system`.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="litellm config.yaml" label="litellm config.yaml">
|
||||
|
||||
```yaml showLineNumbers title="litellm config.yaml"
|
||||
model_list:
|
||||
- model_name: anthropic-auto-inject-cache-system-message
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-5-sonnet-20240620
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
cache_control_injection_points:
|
||||
- location: message
|
||||
role: system
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="UI" label="LiteLLM UI">
|
||||
|
||||
On the LiteLLM UI, you can specify the `cache_control_injection_points` in the `Advanced Settings` tab when adding a model.
|
||||
<Image img={require('../../img/ui_auto_prompt_caching.png')}/>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Detailed Example
|
||||
|
||||
### 1. Original Request to LiteLLM
|
||||
|
||||
In this example, we have a very long, static system message and a varying user message. It's efficient to cache the system message since it rarely changes.
|
||||
|
||||
```json
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question."
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is the main topic of this legal document?"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 2. LiteLLM's Modified Request
|
||||
|
||||
LiteLLM auto-injects the caching directive into the system message based on our configuration:
|
||||
|
||||
```json
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question.",
|
||||
"cache_control": {"type": "ephemeral"}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What is the main topic of this legal document?"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
When the model provider processes this request, it will recognize the caching directive and only process the system message once, caching it for subsequent requests.
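
No application-side change is needed to benefit from this: you call the configured model name as usual and the proxy injects the `cache_control` block. Here is a minimal sketch, assuming the proxy with the config above is running on `http://localhost:4000` and `sk-1234` is a valid proxy key:

```python showLineNumbers title="Calling the auto-caching model via the proxy"
from openai import OpenAI

# Assumptions: LiteLLM proxy with the config above running locally,
# and "sk-1234" is a valid proxy key.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

response = client.chat.completions.create(
    model="anthropic-auto-inject-cache-system-message",  # model_name from the config above
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question.",
        },
        {"role": "user", "content": "What is the main topic of this legal document?"},
    ],
)

print(response.choices[0].message.content)
```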
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
74
docs/my-website/docs/tutorials/scim_litellm.md
Normal file
|
@ -0,0 +1,74 @@
|
|||
|
||||
import Image from '@theme/IdealImage';
|
||||
|
||||
# SCIM with LiteLLM
|
||||
|
||||
SCIM enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning on LiteLLM.
|
||||
|
||||
|
||||
This tutorial will walk you through the steps to connect your IDP to LiteLLM SCIM Endpoints.
|
||||
|
||||
### Supported SSO Providers for SCIM
|
||||
Below is a list of supported SSO providers for connecting to LiteLLM SCIM Endpoints.
|
||||
- Microsoft Entra ID (Azure AD)
|
||||
- Okta
|
||||
- Google Workspace
|
||||
- OneLogin
|
||||
- Keycloak
|
||||
- Auth0
|
||||
|
||||
|
||||
## 1. Get your SCIM Tenant URL and Bearer Token
|
||||
|
||||
On LiteLLM, navigate to Settings > Admin Settings > SCIM. On this page, create a SCIM Token; this allows your IDP to authenticate to the litellm `/scim` endpoints.
|
||||
|
||||
<Image img={require('../../img/scim_2.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
## 2. Connect your IDP to LiteLLM SCIM Endpoints
|
||||
|
||||
In your IDP, navigate to your SSO application and select `Provisioning` > `New provisioning configuration`.
|
||||
|
||||
On this page, paste in your litellm SCIM tenant URL and bearer token.
|
||||
|
||||
Once this is pasted in, click on `Test Connection` to ensure your IDP can authenticate to the LiteLLM SCIM endpoints.
|
||||
|
||||
<Image img={require('../../img/scim_4.png')} style={{ width: '800px', height: 'auto' }} />
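
If you want to verify the token yourself before wiring up your IDP, you can call the SCIM endpoint directly. This is a minimal sketch, assuming your SCIM tenant URL from step 1 has the form `<proxy_base_url>/scim/v2` and that the `requests` package is installed; the token value below is a placeholder:

```python showLineNumbers title="Manually verifying the SCIM tenant URL and token"
import requests

# Assumptions: proxy running on localhost:4000, SCIM tenant URL of the form
# "<proxy_base_url>/scim/v2". Replace the token with the one created in step 1.
SCIM_TENANT_URL = "http://localhost:4000/scim/v2"
SCIM_TOKEN = "your-scim-token"

resp = requests.get(
    f"{SCIM_TENANT_URL}/Users",
    headers={"Authorization": f"Bearer {SCIM_TOKEN}"},
)

# A 200 response with a SCIM ListResponse body means the URL and token are valid.
print(resp.status_code)
print(resp.json())
```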
|
||||
|
||||
|
||||
## 3. Test SCIM Connection
|
||||
|
||||
### 3.1 Assign the group to your LiteLLM Enterprise App
|
||||
|
||||
On your IDP portal, navigate to `Enterprise Applications` and select your litellm app.
|
||||
|
||||
<Image img={require('../../img/msft_enterprise_app.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
<br />
|
||||
<br />
|
||||
|
||||
Once you've selected your litellm app, click on `Users and Groups` > `Add user/group`
|
||||
|
||||
<Image img={require('../../img/msft_enterprise_assign_group.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
<br />
|
||||
|
||||
Now select the group you created in step 1.1 and add it to the LiteLLM Enterprise App. At this point, we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` team in the LiteLLM DB when a new user signs in.
|
||||
|
||||
<Image img={require('../../img/msft_enterprise_select_group.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
### 3.2 Sign in to LiteLLM UI via SSO
|
||||
|
||||
Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
|
||||
|
||||
<Image img={require('../../img/msft_sso_sign_in.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
### 3.3 Check the new team on LiteLLM UI
|
||||
|
||||
On the LiteLLM UI, navigate to `Teams`. You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
|
||||
|
||||
<Image img={require('../../img/msft_auto_team.png')} style={{ width: '900px', height: 'auto' }} />
|
||||
|
||||
|
||||
|
||||
|
145
docs/my-website/docs/tutorials/tag_management.md
Normal file
|
@ -0,0 +1,145 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [Beta] Routing based on request metadata
|
||||
|
||||
Create routing rules based on request metadata.
|
||||
|
||||
## Setup
|
||||
|
||||
Add the following to your litellm proxy config yaml file.
|
||||
|
||||
```yaml showLineNumbers title="litellm proxy config.yaml"
|
||||
router_settings:
|
||||
enable_tag_filtering: True # 👈 Key Change
|
||||
```
|
||||
|
||||
## 1. Create a tag
|
||||
|
||||
On the LiteLLM UI, navigate to Experimental > Tag Management > Create Tag.
|
||||
|
||||
Create a tag called `private-data` and select only the models that requests with this tag are allowed to use. Once created, you will see the tag on the Tag Management page.
|
||||
|
||||
<Image img={require('../../img/tag_create.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
|
||||
## 2. Test Tag Routing
|
||||
|
||||
Now we will test the tag-based routing rules.
|
||||
|
||||
### 2.1 Invalid model
|
||||
|
||||
This request will fail since we send `tags=private-data` but the model `gpt-4o` is not in the allowed models for the `private-data` tag.
|
||||
|
||||
<Image img={require('../../img/tag_invalid.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
<br />
|
||||
|
||||
Here is an example sending the same request using the OpenAI Python SDK.
|
||||
<Tabs>
|
||||
<TabItem value="python" label="OpenAI Python SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="sk-1234",
|
||||
base_url="http://0.0.0.0:4000/v1/"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello, how are you?"}
|
||||
],
|
||||
extra_body={
|
||||
"tags": "private-data"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="curl" label="cURL">
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-4o",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, how are you?"
|
||||
}
|
||||
],
|
||||
"tags": "private-data"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
<br />
|
||||
|
||||
### 2.2 Valid model
|
||||
|
||||
This request will succeed since we send `tags=private-data` and the model `us.anthropic.claude-3-7-sonnet-20250219-v1:0` is in the allowed models for the `private-data` tag.
|
||||
|
||||
<Image img={require('../../img/tag_valid.png')} style={{ width: '800px', height: 'auto' }} />
|
||||
|
||||
Here is an example sending the same request using the OpenAI Python SDK.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="python" label="OpenAI Python SDK">
|
||||
|
||||
```python showLineNumbers
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="sk-1234",
|
||||
base_url="http://0.0.0.0:4000/v1/"
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello, how are you?"}
|
||||
],
|
||||
extra_body={
|
||||
"tags": "private-data"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="curl" label="cURL">
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, how are you?"
|
||||
}
|
||||
],
|
||||
"tags": "private-data"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
## Additional Tag Features
|
||||
- [Sending tags in request headers](https://docs.litellm.ai/docs/proxy/tag_routing#calling-via-request-header)
|
||||
- [Tag based routing](https://docs.litellm.ai/docs/proxy/tag_routing)
|
||||
- [Track spend per tag](cost_tracking#-custom-tags)
|
||||
- [Setup Budgets per Virtual Key, Team](users)
|
||||
|
BIN
docs/my-website/img/auto_prompt_caching.png
Normal file
After Width: | Height: | Size: 1.8 MiB |
BIN
docs/my-website/img/debug_sso.png
Normal file
After Width: | Height: | Size: 167 KiB |
BIN
docs/my-website/img/entra_create_team.png
Normal file
After Width: | Height: | Size: 180 KiB |
BIN
docs/my-website/img/hf_filter_inference_providers.png
Normal file
After Width: | Height: | Size: 120 KiB |
BIN
docs/my-website/img/litellm_codex.gif
Normal file
After Width: | Height: | Size: 12 MiB |
BIN
docs/my-website/img/litellm_entra_id.png
Normal file
After Width: | Height: | Size: 35 KiB |
BIN
docs/my-website/img/managed_files_arch.png
Normal file
After Width: | Height: | Size: 1.2 MiB |
BIN
docs/my-website/img/msft_auto_team.png
Normal file
After Width: | Height: | Size: 62 KiB |
BIN
docs/my-website/img/msft_default_settings.png
Normal file
After Width: | Height: | Size: 141 KiB |
BIN
docs/my-website/img/msft_enterprise_app.png
Normal file
After Width: | Height: | Size: 292 KiB |
BIN
docs/my-website/img/msft_enterprise_assign_group.png
Normal file
After Width: | Height: | Size: 277 KiB |
BIN
docs/my-website/img/msft_enterprise_select_group.png
Normal file
After Width: | Height: | Size: 245 KiB |
BIN
docs/my-website/img/msft_member_1.png
Normal file
After Width: | Height: | Size: 296 KiB |
BIN
docs/my-website/img/msft_member_2.png
Normal file
After Width: | Height: | Size: 274 KiB |
BIN
docs/my-website/img/msft_member_3.png
Normal file
After Width: | Height: | Size: 186 KiB |
BIN
docs/my-website/img/msft_sso_sign_in.png
Normal file
After Width: | Height: | Size: 818 KiB |
BIN
docs/my-website/img/prevent_deadlocks.jpg
Normal file
After Width: | Height: | Size: 325 KiB |
BIN
docs/my-website/img/realtime_api.png
Normal file
After Width: | Height: | Size: 182 KiB |
BIN
docs/my-website/img/release_notes/chat_metrics.png
Normal file
After Width: | Height: | Size: 264 KiB |
BIN
docs/my-website/img/release_notes/new_activity_tab.png
Normal file
After Width: | Height: | Size: 326 KiB |
BIN
docs/my-website/img/release_notes/new_tag_usage.png
Normal file
After Width: | Height: | Size: 207 KiB |
BIN
docs/my-website/img/release_notes/new_team_usage.png
Normal file
After Width: | Height: | Size: 268 KiB |
BIN
docs/my-website/img/release_notes/new_team_usage_highlight.jpg
Normal file
After Width: | Height: | Size: 999 KiB |
BIN
docs/my-website/img/release_notes/spend_by_model.jpg
Normal file
After Width: | Height: | Size: 488 KiB |
BIN
docs/my-website/img/release_notes/sso_sync.png
Normal file
After Width: | Height: | Size: 60 KiB |
BIN
docs/my-website/img/release_notes/tag_management.png
Normal file
After Width: | Height: | Size: 204 KiB |
BIN
docs/my-website/img/release_notes/unified_responses_api_rn.png
Normal file
After Width: | Height: | Size: 244 KiB |
BIN
docs/my-website/img/scim_0.png
Normal file
After Width: | Height: | Size: 380 KiB |
BIN
docs/my-website/img/scim_1.png
Normal file
After Width: | Height: | Size: 231 KiB |
BIN
docs/my-website/img/scim_2.png
Normal file
After Width: | Height: | Size: 261 KiB |
BIN
docs/my-website/img/scim_3.png
Normal file
After Width: | Height: | Size: 413 KiB |
BIN
docs/my-website/img/scim_4.png
Normal file
After Width: | Height: | Size: 274 KiB |
BIN
docs/my-website/img/scim_integration.png
Normal file
After Width: | Height: | Size: 31 KiB |
BIN
docs/my-website/img/tag_create.png
Normal file
After Width: | Height: | Size: 250 KiB |