Merge branch 'BerriAI:main' into fix-sqli

2025-04-24 18:24:20 +00:00 · 2025-04-21 20:35:55 +04:00 · 2025-04-21 20:35:55 +04:00 · 3b1a13d440
commit 3b1a13d440
parent 1894b34650 10257426a2
393 changed files with 19184 additions and 5317 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -2390,6 +2390,114 @@ jobs:
            echo "triggering load testing server for version ${VERSION} and commit ${CIRCLE_SHA1}"
            curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}&release_type=nightly"

+  # Publishes the `litellm-proxy-extras` package to PyPI.
+  #
+  # Flow:
+  #   1. Compare the local package version and contents against the last
+  #      release on PyPI; halt the job when nothing changed, fail when the
+  #      contents changed without a version bump.
+  #   2. Export the new version for later steps via $BASH_ENV.
+  #   3. Verify the root pyproject.toml and requirements.txt pin that version.
+  #   4. Build and upload with twine.
+  #
+  # Every `run` step starts in `working_directory` (the package root), so
+  # relative paths such as `pyproject.toml` and `./litellm_proxy_extras`
+  # resolve inside ~/project/litellm-proxy-extras.
+  publish_proxy_extras:
+    docker:
+      - image: cimg/python:3.8
+    working_directory: ~/project/litellm-proxy-extras
+    environment:
+      TWINE_USERNAME: __token__
+
+    steps:
+      - checkout:
+          path: ~/project
+
+      - run:
+          name: Check if litellm-proxy-extras dir or pyproject.toml was modified
+          command: |
+            echo "Install TOML package."
+            # `packaging` is imported by the version comparison below, so
+            # install it explicitly instead of relying on the base image.
+            python -m pip install toml packaging
+            # Get current version from pyproject.toml
+            CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])")
+
+            # Get last published version from PyPI
+            LAST_VERSION=$(curl -s https://pypi.org/pypi/litellm-proxy-extras/json | python -c "import json, sys; print(json.load(sys.stdin)['info']['version'])")
+
+            echo "Current version: $CURRENT_VERSION"
+            echo "Last published version: $LAST_VERSION"
+
+            # Compare versions using Python's packaging.version
+            # (prints 1 when the local version is older than the published one)
+            VERSION_COMPARE=$(python -c "from packaging import version; print(1 if version.parse('$CURRENT_VERSION') < version.parse('$LAST_VERSION') else 0)")
+
+            echo "Version compare: $VERSION_COMPARE"
+            if [ "$VERSION_COMPARE" = "1" ]; then
+              echo "Error: Current version ($CURRENT_VERSION) is less than last published version ($LAST_VERSION)"
+              exit 1
+            fi
+
+            # If versions are equal or current is greater, check contents
+            pip download --no-deps "litellm-proxy-extras==$LAST_VERSION" -d /tmp
+
+            echo "Contents of /tmp directory:"
+            ls -la /tmp
+
+            # Find the downloaded file (could be .whl or .tar.gz)
+            DOWNLOADED_FILE=$(ls /tmp/litellm_proxy_extras-*)
+            echo "Downloaded file: $DOWNLOADED_FILE"
+
+            # Extract based on file extension
+            if [[ "$DOWNLOADED_FILE" == *.whl ]]; then
+                echo "Extracting wheel file..."
+                unzip -q "$DOWNLOADED_FILE" -d /tmp/extracted
+                EXTRACTED_DIR="/tmp/extracted"
+            else
+                echo "Extracting tar.gz file..."
+                tar -xzf "$DOWNLOADED_FILE" -C /tmp
+                EXTRACTED_DIR="/tmp/litellm_proxy_extras-$LAST_VERSION"
+            fi
+
+            echo "Contents of extracted package:"
+            ls -R "$EXTRACTED_DIR"
+
+            # Compare contents:
+            #   - differ + same version  -> fail (version bump required)
+            #   - differ + new version   -> continue to the publish steps
+            #   - identical              -> halt the job without publishing
+            if ! diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras; then
+              if [ "$CURRENT_VERSION" = "$LAST_VERSION" ]; then
+                echo "Error: Changes detected in litellm-proxy-extras but version was not bumped"
+                echo "Current version: $CURRENT_VERSION"
+                echo "Last published version: $LAST_VERSION"
+                echo "Changes:"
+                diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras
+                exit 1
+              fi
+            else
+              echo "No changes detected in litellm-proxy-extras. Skipping PyPI publish."
+              circleci step halt
+            fi
+
+      - run:
+          name: Get new version
+          command: |
+            # Absolute path: the step already starts in working_directory, so a
+            # relative `cd litellm-proxy-extras` would look for a nested directory.
+            cd ~/project/litellm-proxy-extras
+            NEW_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])")
+            # Persist NEW_VERSION for the following steps
+            echo "export NEW_VERSION=$NEW_VERSION" >> "$BASH_ENV"
+
+      - run:
+          name: Check if versions match
+          command: |
+            cd ~/project
+            # Check pyproject.toml
+            # NOTE(review): this expects the dependency entry to be a string that
+            # contains a double quote; confirm it works for the format actually
+            # used in the root pyproject.toml (plain string or inline table).
+            CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['dependencies']['litellm-proxy-extras'].split('\"')[1])")
+            if [ "$CURRENT_VERSION" != "$NEW_VERSION" ]; then
+              echo "Error: Version in pyproject.toml ($CURRENT_VERSION) doesn't match new version ($NEW_VERSION)"
+              exit 1
+            fi
+
+            # Check requirements.txt
+            # `|| true` keeps a missing pin from aborting the step silently, so
+            # the explicit mismatch error below is printed instead.
+            REQ_VERSION=$(grep -oP 'litellm-proxy-extras==\K[0-9.]+' requirements.txt || true)
+            if [ "$REQ_VERSION" != "$NEW_VERSION" ]; then
+              echo "Error: Version in requirements.txt ($REQ_VERSION) doesn't match new version ($NEW_VERSION)"
+              exit 1
+            fi
+
+      - run:
+          name: Publish to PyPI
+          command: |
+            # Absolute path for the same reason as in "Get new version"
+            cd ~/project/litellm-proxy-extras
+            # Credentials are read from PYPI_PUBLISH_USERNAME / PYPI_PUBLISH_PASSWORD
+            echo -e "[pypi]\nusername = $PYPI_PUBLISH_USERNAME\npassword = $PYPI_PUBLISH_PASSWORD" > ~/.pypirc
+            python -m pip install --upgrade pip build twine setuptools wheel
+            # Start from a clean tree so only freshly built artifacts are uploaded
+            rm -rf build dist
+            python -m build
+            twine upload --verbose dist/*
+
  e2e_ui_testing:
    machine:
      image: ubuntu-2204:2023.10.1
@ -2785,6 +2893,11 @@ workflows:
              only:
                - main
                - /litellm_.*/
+      - publish_proxy_extras:
+          filters:
+            branches:
+              only:
+                - main
      - publish_to_pypi:
          requires:
            - local_testing
@ -2819,7 +2932,5 @@ workflows:
            - proxy_build_from_pip_tests
            - proxy_pass_through_endpoint_tests
            - check_code_and_doc_quality
-          filters:
-            branches:
-              only:
-                - main
+            - publish_proxy_extras
+      
--- a/.gitignore
+++ b/.gitignore
@ -73,6 +73,7 @@ tests/local_testing/log.txt
 .codegpt
 litellm/proxy/_new_new_secret_config.yaml
 litellm/proxy/custom_guardrail.py
+.mypy_cache/*
 litellm/proxy/_experimental/out/404.html
 litellm/proxy/_experimental/out/404.html
 litellm/proxy/_experimental/out/model_hub.html
@ -85,3 +86,4 @@ litellm/proxy/db/migrations/0_init/migration.sql
 litellm/proxy/db/migrations/*
 litellm/proxy/migrations/*config.yaml
 litellm/proxy/migrations/*
+tests/litellm/litellm_core_utils/llm_cost_calc/log.txt
--- a/docs/my-website/docs/files_endpoints.md
+++ b/docs/my-website/docs/files_endpoints.md
@ -2,10 +2,12 @@
 import TabItem from '@theme/TabItem';
 import Tabs from '@theme/Tabs';

-# /files
+# Provider Files Endpoints

 Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.

+Use this to call the provider's `/files` endpoints directly, in the OpenAI format. 
+
 ## Quick Start

 - Upload a File
@ -19,7 +21,7 @@ Files are used to upload documents that can be used with features like Assistant
 <Tabs>
 <TabItem value="proxy" label="LiteLLM PROXY Server">

-### 1. Setup config.yaml
+1. Setup config.yaml

 ```
 # for /files endpoints
@ -32,7 +34,7 @@ files_settings:
    api_key: os.environ/OPENAI_API_KEY
 ```

-### 2. Start LiteLLM PROXY Server
+2. Start LiteLLM PROXY Server

 ```bash
 litellm --config /path/to/config.yaml
@ -40,7 +42,7 @@ litellm --config /path/to/config.yaml
 ## RUNNING on http://0.0.0.0:4000
 ```

-### 3. Use OpenAI's /files endpoints
+3. Use OpenAI's /files endpoints

 Upload a File

--- a/docs/my-website/docs/pass_through/cohere.md
+++ b/docs/my-website/docs/pass_through/cohere.md
@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f

 | Feature | Supported | Notes | 
 |-------|-------|-------|
-| Cost Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
+| Cost Tracking | ✅ | Supported for `/v1/chat`, and `/v2/chat` |
 | Logging | ✅ | works across all integrations |
 | End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
 | Streaming | ✅ | |
--- a/docs/my-website/docs/pass_through/mistral.md
+++ b/docs/my-website/docs/pass_through/mistral.md
@ -0,0 +1,217 @@
+# Mistral
+
+Pass-through endpoints for Mistral - call provider-specific endpoint, in native format (no translation).
+
+| Feature | Supported | Notes | 
+|-------|-------|-------|
+| Cost Tracking | ❌ | Not supported |
+| Logging | ✅ | works across all integrations |
+| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
+| Streaming | ✅ | |
+
+Just replace `https://api.mistral.ai/v1` with `LITELLM_PROXY_BASE_URL/mistral` 🚀
+
+#### **Example Usage**
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "mistral-ocr-latest",
+    "document": {
+        "type": "image_url",
+        "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
+    }
+
+}'
+```
+
+Supports **ALL** Mistral Endpoints (including streaming).
+
+## Quick Start
+
+Let's call the Mistral [`/chat/completions` endpoint](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post)
+
+1. Add MISTRAL_API_KEY to your environment 
+
+```bash
+export MISTRAL_API_KEY="sk-1234"
+```
+
+2. Start LiteLLM Proxy 
+
+```bash
+litellm
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it! 
+
+Let's call the Mistral `/ocr` endpoint
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "mistral-ocr-latest",
+    "document": {
+        "type": "image_url",
+        "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
+    }
+
+}'
+```
+
+
+## Examples
+
+Anything after `http://0.0.0.0:4000/mistral` is treated as a provider-specific route, and handled accordingly.
+
+Key Changes: 
+
+| **Original Endpoint**                                | **Replace With**                  |
+|------------------------------------------------------|-----------------------------------|
+| `https://api.mistral.ai/v1`          | `http://0.0.0.0:4000/mistral` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000")      |
+| `bearer $MISTRAL_API_KEY`                                 | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy)                    |
+
+
+### **Example 1: OCR endpoint**
+
+#### LiteLLM Proxy Call 
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \
+-H 'Content-Type: application/json' \
+-H "Authorization: Bearer $LITELLM_VIRTUAL_KEY" \
+-d '{
+    "model": "mistral-ocr-latest",
+    "document": {
+        "type": "image_url",
+        "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
+    }
+}'
+```
+
+
+#### Direct Mistral API Call 
+
+```bash
+curl https://api.mistral.ai/v1/ocr \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer ${MISTRAL_API_KEY}" \
+  -d '{
+    "model": "mistral-ocr-latest",
+    "document": {
+        "type": "document_url",
+        "document_url": "https://arxiv.org/pdf/2201.04234"
+    },
+    "include_image_base64": true
+  }'
+```
+
+### **Example 2: Chat API**
+
+#### LiteLLM Proxy Call 
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H "Authorization: Bearer $LITELLM_VIRTUAL_KEY" \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "I am going to Paris, what should I see?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.8,
+    "top_p": 0.1,
+    "model": "mistral-large-latest"
+}'
+```
+
+#### Direct Mistral API Call 
+
+```bash
+curl -L -X POST 'https://api.mistral.ai/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H "Authorization: Bearer $MISTRAL_API_KEY" \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "I am going to Paris, what should I see?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.8,
+    "top_p": 0.1,
+    "model": "mistral-large-latest"
+}'
+```
+
+
+## Advanced - Use with Virtual Keys 
+
+Pre-requisites
+- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
+
+Use this to avoid giving developers the raw Mistral API key, while still letting them use Mistral endpoints.
+
+### Usage
+
+1. Setup environment
+
+```bash
+export DATABASE_URL=""
+export LITELLM_MASTER_KEY=""
+export MISTRAL_API_KEY=""
+```
+
+```bash
+litellm
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+2. Generate virtual key 
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/key/generate' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{}'
+```
+
+Expected Response 
+
+```bash
+{
+    ...
+    "key": "sk-1234ewknldferwedojwojw"
+}
+```
+
+3. Test it! 
+
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
+  --data '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "I am going to Paris, what should I see?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.8,
+    "top_p": 0.1,
+    "model": "mistral-large-latest"
+}'
+```
--- a/docs/my-website/docs/pass_through/vertex_ai.md
+++ b/docs/my-website/docs/pass_through/vertex_ai.md
@ -13,6 +13,15 @@ Pass-through endpoints for Vertex AI - call provider-specific endpoint, in nativ
 | End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
 | Streaming | ✅ | |

+## Supported Endpoints
+
+LiteLLM supports two Vertex AI pass-through routes:
+
+1. `/vertex_ai` → routes to `https://{vertex_location}-aiplatform.googleapis.com/`
+2. `/vertex_ai/discovery` → routes to [`https://discoveryengine.googleapis.com`](https://discoveryengine.googleapis.com/)
+
+## How to use
+
 Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai`

 LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through:
--- a/docs/my-website/docs/pass_through/vllm.md
+++ b/docs/my-website/docs/pass_through/vllm.md
@ -0,0 +1,185 @@
+# VLLM
+
+Pass-through endpoints for VLLM - call provider-specific endpoint, in native format (no translation).
+
+| Feature | Supported | Notes | 
+|-------|-------|-------|
+| Cost Tracking | ❌ | Not supported |
+| Logging | ✅ | works across all integrations |
+| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
+| Streaming | ✅ | |
+
+Just replace `https://my-vllm-server.com` with `LITELLM_PROXY_BASE_URL/vllm` 🚀
+
+#### **Example Usage**
+
+```bash
+curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234'
+```
+
+Supports **ALL** VLLM Endpoints (including streaming).
+
+## Quick Start
+
+Let's call the VLLM [`/metrics` endpoint](https://vllm.readthedocs.io/en/latest/api_reference/api_reference.html)
+
+1. Add `HOSTED_VLLM_API_BASE` to your environment
+
+```bash
+export HOSTED_VLLM_API_BASE="https://my-vllm-server.com"
+```
+
+2. Start LiteLLM Proxy 
+
+```bash
+litellm
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it! 
+
+Let's call the VLLM `/metrics` endpoint
+
+```bash
+curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234'
+```
+
+
+## Examples
+
+Anything after `http://0.0.0.0:4000/vllm` is treated as a provider-specific route, and handled accordingly.
+
+Key Changes: 
+
+| **Original Endpoint**                                | **Replace With**                  |
+|------------------------------------------------------|-----------------------------------|
+| `https://my-vllm-server.com`          | `http://0.0.0.0:4000/vllm` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000")      |
+| `bearer $VLLM_API_KEY`                                 | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy)                    |
+
+
+### **Example 1: Metrics endpoint**
+
+#### LiteLLM Proxy Call 
+
+```bash
+curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \
+-H 'Content-Type: application/json' \
+-H "Authorization: Bearer $LITELLM_VIRTUAL_KEY"
+```
+
+
+#### Direct VLLM API Call 
+
+```bash
+curl -L -X GET 'https://my-vllm-server.com/metrics' \
+-H 'Content-Type: application/json'
+```
+
+### **Example 2: Chat API**
+
+#### LiteLLM Proxy Call 
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
+-H 'Content-Type: application/json' \
+-H "Authorization: Bearer $LITELLM_VIRTUAL_KEY" \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "I am going to Paris, what should I see?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.8,
+    "top_p": 0.1,
+    "model": "qwen2.5-7b-instruct"
+}'
+```
+
+#### Direct VLLM API Call 
+
+```bash
+curl -L -X POST 'https://my-vllm-server.com/chat/completions' \
+-H 'Content-Type: application/json' \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "I am going to Paris, what should I see?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.8,
+    "top_p": 0.1,
+    "model": "qwen2.5-7b-instruct"
+}'
+```
+
+
+## Advanced - Use with Virtual Keys 
+
+Pre-requisites
+- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
+
+Use this to avoid giving developers the raw VLLM API key, while still letting them use VLLM endpoints.
+
+### Usage
+
+1. Setup environment
+
+```bash
+export DATABASE_URL=""
+export LITELLM_MASTER_KEY=""
+export HOSTED_VLLM_API_BASE=""
+```
+
+```bash
+litellm
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+2. Generate virtual key 
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/key/generate' \
+-H 'Authorization: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{}'
+```
+
+Expected Response 
+
+```bash
+{
+    ...
+    "key": "sk-1234ewknldferwedojwojw"
+}
+```
+
+3. Test it! 
+
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
+  --data '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "I am going to Paris, what should I see?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.8,
+    "top_p": 0.1,
+    "model": "qwen2.5-7b-instruct"
+}'
+```
--- a/docs/my-website/docs/providers/azure.md
+++ b/docs/my-website/docs/providers/azure.md
@ -1002,8 +1002,125 @@ Expected Response:
 ```


+## **Azure Responses API**

+| Property | Details |
+|-------|-------|
+| Description | Azure OpenAI Responses API |
+| `custom_llm_provider` on LiteLLM | `azure/` |
+| Supported Operations | `/v1/responses`|
+| Azure OpenAI Responses API | [Azure OpenAI Responses API ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/responses?tabs=python-secure) |
+| Cost Tracking, Logging Support | ✅ LiteLLM will log, track cost for Responses API Requests |
+| Supported OpenAI Params | ✅ All OpenAI params are supported, [See here](https://github.com/BerriAI/litellm/blob/0717369ae6969882d149933da48eeb8ab0e691bd/litellm/llms/openai/responses/transformation.py#L23) |

+## Usage
+
+## Create a model response
+
+<Tabs>
+<TabItem value="litellm-sdk" label="LiteLLM SDK">
+
+#### Non-streaming
+
+```python showLineNumbers title="Azure Responses API"
+import os
+
+import litellm
+
+# Non-streaming response
+response = litellm.responses(
+    model="azure/o1-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    max_output_tokens=100,
+    api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
+    api_base="https://litellm8397336933.openai.azure.com/",
+    api_version="2023-03-15-preview",
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="Azure Responses API"
+import os
+
+import litellm
+
+# Streaming response
+response = litellm.responses(
+    model="azure/o1-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True,
+    api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"),
+    api_base="https://litellm8397336933.openai.azure.com/",
+    api_version="2023-03-15-preview",
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
+
+First, add this to your litellm proxy config.yaml:
+```yaml showLineNumbers title="Azure Responses API"
+model_list:
+  - model_name: o1-pro
+    litellm_params:
+      model: azure/o1-pro
+      api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY
+      api_base: https://litellm8397336933.openai.azure.com/
+      api_version: 2023-03-15-preview
+```
+
+Start your LiteLLM proxy:
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+Then use the OpenAI SDK pointed to your proxy:
+
+#### Non-streaming
+```python showLineNumbers
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Non-streaming response
+response = client.responses.create(
+    model="o1-pro",
+    input="Tell me a three sentence bedtime story about a unicorn."
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Streaming response
+response = client.responses.create(
+    model="o1-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+</Tabs>



--- a/docs/my-website/docs/providers/gemini.md
+++ b/docs/my-website/docs/providers/gemini.md
@ -39,14 +39,164 @@ response = completion(
 - temperature
 - top_p
 - max_tokens
+- max_completion_tokens
 - stream
 - tools
 - tool_choice
+- functions
 - response_format
 - n
 - stop
+- logprobs
+- frequency_penalty
+- modalities
+- reasoning_effort
+
+**Anthropic Params**
+- thinking (used to set max budget tokens across anthropic/gemini models)
+
+[**See Updated List**](https://github.com/BerriAI/litellm/blob/main/litellm/llms/gemini/chat/transformation.py#L70)
+
+
+
+## Usage - Thinking / `reasoning_content`
+
+LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
+
+**Mapping**
+
+| reasoning_effort | thinking |
+| ---------------- | -------- |
+| "low"            | "budget_tokens": 1024 |
+| "medium"         | "budget_tokens": 2048 |
+| "high"           | "budget_tokens": 4096 |
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+resp = completion(
+    model="gemini/gemini-2.5-flash-preview-04-17",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    reasoning_effort="low",
+)
+
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+- model_name: gemini-2.5-flash
+  litellm_params:
+    model: gemini/gemini-2.5-flash-preview-04-17
+    api_key: os.environ/GEMINI_API_KEY
+```
+
+2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
+  -d '{
+    "model": "gemini-2.5-flash",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "reasoning_effort": "low"
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
+**Expected Response**
+
+```python
+ModelResponse(
+    id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
+    created=1740470510,
+    model='gemini-2.5-flash-preview-04-17',
+    object='chat.completion',
+    system_fingerprint=None,
+    choices=[
+        Choices(
+            finish_reason='stop',
+            index=0,
+            message=Message(
+                content="The capital of France is Paris.",
+                role='assistant',
+                tool_calls=None,
+                function_call=None,
+                reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
+            ),
+        )
+    ],
+    usage=Usage(
+        completion_tokens=68,
+        prompt_tokens=42,
+        total_tokens=110,
+        completion_tokens_details=None,
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None,
+            cached_tokens=0,
+            text_tokens=None,
+            image_tokens=None
+        ),
+        cache_creation_input_tokens=0,
+        cache_read_input_tokens=0
+    )
+)
+```
+
+### Pass `thinking` to Gemini models
+
+You can also pass the `thinking` parameter to Gemini models.
+
+This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+import litellm
+
+response = litellm.completion(
+  model="gemini/gemini-2.5-flash-preview-04-17",
+  messages=[{"role": "user", "content": "What is the capital of France?"}],
+  thinking={"type": "enabled", "budget_tokens": 1024},
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "gemini/gemini-2.5-flash-preview-04-17",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
+

-[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)

 ## Passing Gemini Specific Params
 ### Response schema 
--- a/docs/my-website/docs/providers/openai.md
+++ b/docs/my-website/docs/providers/openai.md
@ -163,6 +163,12 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base"     # OPTIONAL

 | Model Name            | Function Call                                                   |
 |-----------------------|-----------------------------------------------------------------|
+| gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` |
+| gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` |
+| gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` |
+| o4-mini | `response = completion(model="o4-mini", messages=messages)` |
+| o3-mini | `response = completion(model="o3-mini", messages=messages)` |
+| o3 | `response = completion(model="o3", messages=messages)` |
 | o1-mini | `response = completion(model="o1-mini", messages=messages)` |
 | o1-preview | `response = completion(model="o1-preview", messages=messages)` |
 | gpt-4o-mini  | `response = completion(model="gpt-4o-mini", messages=messages)` |
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@ -347,7 +347,7 @@ Return a `list[Recipe]`
 completion(model="vertex_ai/gemini-1.5-flash-preview-0514", messages=messages, response_format={ "type": "json_object" })
 ```

-### **Grounding**
+### **Grounding - Web Search**

 Add Google Search Result grounding to vertex ai calls. 

@ -358,7 +358,7 @@ See the grounding metadata with `response_obj._hidden_params["vertex_ai_groundin
 <Tabs>
 <TabItem value="sdk" label="SDK">

-```python 
+```python showLineNumbers
 from litellm import completion 

 ## SETUP ENVIRONMENT
@ -377,14 +377,36 @@ print(resp)
 </TabItem>
 <TabItem value="proxy" label="PROXY">

-```bash
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python showLineNumbers
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
+    base_url="http://0.0.0.0:4000/v1/" # point to litellm proxy
+)
+
+response = client.chat.completions.create(
+    model="gemini-pro",
+    messages=[{"role": "user", "content": "Who won the world cup?"}],
+    tools=[{"googleSearchRetrieval": {}}],
+)
+
+print(response)
+```
+</TabItem>
+<TabItem value="curl" label="cURL">
+
+```bash showLineNumbers
 curl http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "gemini-pro",
    "messages": [
-      {"role": "user", "content": "Hello, Claude!"}
+      {"role": "user", "content": "Who won the world cup?"}
    ],
   "tools": [
        {
@ -394,12 +416,82 @@ curl http://localhost:4000/v1/chat/completions \
  }'

 ```
+</TabItem>
+</Tabs>

 </TabItem>
 </Tabs>

 You can also use the `enterpriseWebSearch` tool for an [enterprise compliant search](https://cloud.google.com/vertex-ai/generative-ai/docs/grounding/web-grounding-enterprise).

+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python showLineNumbers
+from litellm import completion 
+
+## SETUP ENVIRONMENT
+# !gcloud auth application-default login - run this to add vertex credentials to your env
+
+tools = [{"enterpriseWebSearch": {}}] # 👈 ADD GOOGLE ENTERPRISE SEARCH
+
+resp = completion(
+                    model="vertex_ai/gemini-1.0-pro-001",
+                    messages=[{"role": "user", "content": "Who won the world cup?"}],
+                    tools=tools,
+                )
+
+print(resp)
+```
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python SDK">
+
+```python showLineNumbers
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
+    base_url="http://0.0.0.0:4000/v1/" # point to litellm proxy
+)
+
+response = client.chat.completions.create(
+    model="gemini-pro",
+    messages=[{"role": "user", "content": "Who won the world cup?"}],
+    tools=[{"enterpriseWebSearch": {}}],
+)
+
+print(response)
+```
+</TabItem>
+<TabItem value="curl" label="cURL">
+
+```bash showLineNumbers
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "gemini-pro",
+    "messages": [
+      {"role": "user", "content": "Who won the world cup?"}
+    ],
+   "tools": [
+        {
+            "enterpriseWebSearch": {} 
+        }
+    ]
+  }'
+
+```
+</TabItem>
+</Tabs>
+
+</TabItem>
+</Tabs>
+
+
 #### **Moving from Vertex AI SDK to LiteLLM (GROUNDING)**


@ -450,6 +542,154 @@ print(resp)
 ```


+### **Thinking / `reasoning_content`**
+
+LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362)
+
+**Mapping**
+
+| reasoning_effort | thinking |
+| ---------------- | -------- |
+| "low"            | "budget_tokens": 1024 |
+| "medium"         | "budget_tokens": 2048 |
+| "high"           | "budget_tokens": 4096 |
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+# !gcloud auth application-default login - run this to add vertex credentials to your env
+
+resp = completion(
+    model="vertex_ai/gemini-2.5-flash-preview-04-17",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    reasoning_effort="low",
+    vertex_project="project-id",
+    vertex_location="us-central1"
+)
+
+```
+
+</TabItem>
+
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+- model_name: gemini-2.5-flash
+  litellm_params:
+    model: vertex_ai/gemini-2.5-flash-preview-04-17
+    vertex_credentials: {"project_id": "project-id", "location": "us-central1", "project_key": "project-key"}
+    vertex_project: "project-id"
+    vertex_location: "us-central1"
+```
+
+2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer <YOUR-LITELLM-KEY>" \
+  -d '{
+    "model": "gemini-2.5-flash",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "reasoning_effort": "low"
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
+**Expected Response**
+
+```python
+ModelResponse(
+    id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e',
+    created=1740470510,
+    model='gemini-2.5-flash-preview-04-17',
+    object='chat.completion',
+    system_fingerprint=None,
+    choices=[
+        Choices(
+            finish_reason='stop',
+            index=0,
+            message=Message(
+                content="The capital of France is Paris.",
+                role='assistant',
+                tool_calls=None,
+                function_call=None,
+                reasoning_content='The capital of France is Paris. This is a very straightforward factual question.'
+            ),
+        )
+    ],
+    usage=Usage(
+        completion_tokens=68,
+        prompt_tokens=42,
+        total_tokens=110,
+        completion_tokens_details=None,
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None,
+            cached_tokens=0,
+            text_tokens=None,
+            image_tokens=None
+        ),
+        cache_creation_input_tokens=0,
+        cache_read_input_tokens=0
+    )
+)
+```
+
+#### Pass `thinking` to Gemini models
+
+You can also pass the `thinking` parameter to Gemini models.
+
+This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget).
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+from litellm import completion
+
+# !gcloud auth application-default login - run this to add vertex credentials to your env
+
+response = completion(
+  model="vertex_ai/gemini-2.5-flash-preview-04-17",
+  messages=[{"role": "user", "content": "What is the capital of France?"}],
+  thinking={"type": "enabled", "budget_tokens": 1024},
+  vertex_project="project-id",
+  vertex_location="us-central1"
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $LITELLM_KEY" \
+  -d '{
+    "model": "vertex_ai/gemini-2.5-flash-preview-04-17",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "thinking": {"type": "enabled", "budget_tokens": 1024}
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+
 ### **Context Caching**

 Use Vertex AI context caching is supported by calling provider api directly. (Unified Endpoint support comin soon.).
--- a/docs/my-website/docs/providers/vllm.md
+++ b/docs/my-website/docs/providers/vllm.md
@ -161,6 +161,120 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \

 Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020)

+<Tabs>
+<TabItem value="files_message" label="(Unified) Files Message">
+
+Use this to send a video url to VLLM + Gemini in the same format, using OpenAI's `files` message type.
+
+There are two ways to send a video url to VLLM:
+
+1. Pass the video url directly
+
+```
+{"type": "file", "file": {"file_id": video_url}},
+```
+
+2. Pass the video data as base64
+
+```
+{"type": "file", "file": {"file_data": f"data:video/mp4;base64,{video_data_base64}"}}
+```
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+import os
+from litellm import completion
+messages=[
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "Summarize the following video"
+            },
+            {
+                "type": "file",
+                "file": {
+                    "file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+                }
+            }
+        ]
+    }
+]
+
+# call vllm 
+os.environ["HOSTED_VLLM_API_BASE"] = "https://hosted-vllm-api.co"
+os.environ["HOSTED_VLLM_API_KEY"] = "" # [optional], if your VLLM server requires an API key
+response = completion(
+    model="hosted_vllm/qwen", # pass the vllm model name
+    messages=messages,
+)
+
+# call gemini 
+os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
+response = completion(
+    model="gemini/gemini-1.5-flash", # pass the gemini model name
+    messages=messages,
+)
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+    - model_name: my-model
+      litellm_params:
+        model: hosted_vllm/qwen  # add hosted_vllm/ prefix to route as OpenAI provider
+        api_base: https://hosted-vllm-api.co      # add api base for OpenAI compatible provider
+    - model_name: my-gemini-model
+      litellm_params:
+        model: gemini/gemini-1.5-flash  # add gemini/ prefix to route as Google AI Studio provider
+        api_key: os.environ/GEMINI_API_KEY
+```
+
+2. Start the proxy 
+
+```bash
+$ litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+3. Test it! 
+
+```bash
+curl -X POST http://0.0.0.0:4000/chat/completions \
+-H "Authorization: Bearer sk-1234" \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "my-model",
+    "messages": [
+        {"role": "user", "content": 
+            [
+                {"type": "text", "text": "Summarize the following video"},
+                {"type": "file", "file": {"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}}
+            ]
+        }
+    ]
+}'
+```
+
+</TabItem>
+</Tabs>
+
+
+</TabItem>
+<TabItem value="video_url" label="(VLLM-specific) Video Message">
+
+Use this to send a video url to VLLM in its native message format (`video_url`).
+
 There are two ways to send a video url to VLLM:

 1. Pass the video url directly
@ -249,6 +363,10 @@ curl -X POST http://0.0.0.0:4000/chat/completions \
 </Tabs>


+</TabItem>
+</Tabs>
+
+
 ## (Deprecated) for `vllm pip package` 
 ### Using - `litellm.completion`

--- a/docs/my-website/docs/providers/xai.md
+++ b/docs/my-website/docs/providers/xai.md
@ -176,3 +176,81 @@ Here's how to call a XAI model with the LiteLLM Proxy Server
  </Tabs>


+## Reasoning Usage
+
+LiteLLM supports reasoning usage for xAI models.
+
+<Tabs>
+
+<TabItem value="python" label="LiteLLM Python SDK">
+
+```python showLineNumbers title="reasoning with xai/grok-3-mini-beta"
+import litellm
+response = litellm.completion(
+    model="xai/grok-3-mini-beta",
+    messages=[{"role": "user", "content": "What is 101*3?"}],
+    reasoning_effort="low",
+)
+
+print("Reasoning Content:")
+print(response.choices[0].message.reasoning_content)
+
+print("\nFinal Response:")
+print(response.choices[0].message.content)
+
+print("\nNumber of completion tokens (input):")
+print(response.usage.completion_tokens)
+
+print("\nNumber of reasoning tokens (input):")
+print(response.usage.completion_tokens_details.reasoning_tokens)
+```
+</TabItem>
+
+<TabItem value="curl" label="LiteLLM Proxy - OpenAI SDK Usage">
+
+```python showLineNumbers title="reasoning with xai/grok-3-mini-beta"
+import openai
+client = openai.OpenAI(
+    api_key="sk-1234",             # pass litellm proxy key, if you're using virtual keys
+    base_url="http://0.0.0.0:4000" # litellm-proxy-base url
+)
+
+response = client.chat.completions.create(
+    model="xai/grok-3-mini-beta",
+    messages=[{"role": "user", "content": "What is 101*3?"}],
+    reasoning_effort="low",
+)
+
+print("Reasoning Content:")
+print(response.choices[0].message.reasoning_content)
+
+print("\nFinal Response:")
+print(response.choices[0].message.content)
+
+print("\nNumber of completion tokens (input):")
+print(response.usage.completion_tokens)
+
+print("\nNumber of reasoning tokens (input):")
+print(response.usage.completion_tokens_details.reasoning_tokens)
+```
+
+</TabItem>
+</Tabs>
+
+**Example Response:**
+
+```shell
+Reasoning Content:
+Let me calculate 101 multiplied by 3:
+101 * 3 = 303.
+I can double-check that: 100 * 3 is 300, and 1 * 3 is 3, so 300 + 3 = 303. Yes, that's correct.
+
+Final Response:
+The result of 101 multiplied by 3 is 303.
+
+Number of completion tokens (input):
+14
+
+Number of reasoning tokens (input):
+310
+```
--- a/docs/my-website/docs/proxy/config_settings.md
+++ b/docs/my-website/docs/proxy/config_settings.md
@ -323,6 +323,9 @@ router_settings:
 | AZURE_AUTHORITY_HOST | Azure authority host URL
 | AZURE_CLIENT_ID | Client ID for Azure services
 | AZURE_CLIENT_SECRET | Client secret for Azure services
+| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
+| AZURE_USERNAME | Username for Azure services, use in conjunction with AZURE_PASSWORD for azure ad token with basic username/password workflow
+| AZURE_PASSWORD | Password for Azure services, use in conjunction with AZURE_USERNAME for azure ad token with basic username/password workflow
 | AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
 | AZURE_KEY_VAULT_URI | URI for Azure Key Vault
 | AZURE_STORAGE_ACCOUNT_KEY | The Azure Storage Account Key to use for Authentication to Azure Blob Storage logging
@ -331,7 +334,6 @@ router_settings:
 | AZURE_STORAGE_TENANT_ID | The Application Tenant ID to use for Authentication to Azure Blob Storage logging
 | AZURE_STORAGE_CLIENT_ID | The Application Client ID to use for Authentication to Azure Blob Storage logging
 | AZURE_STORAGE_CLIENT_SECRET | The Application Client Secret to use for Authentication to Azure Blob Storage logging
-| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
 | BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
 | BRAINTRUST_API_KEY | API key for Braintrust integration
 | CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
@ -433,6 +435,7 @@ router_settings:
 | LITERAL_BATCH_SIZE | Batch size for Literal operations
 | LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
 | LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
+| LITELLM_MODIFY_PARAMS | Parameters to modify in LiteLLM requests
 | LITELLM_EMAIL | Email associated with LiteLLM account
 | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
 | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
@ -446,6 +449,8 @@ router_settings:
 | LITELLM_TOKEN | Access token for LiteLLM integration
 | LITELLM_PRINT_STANDARD_LOGGING_PAYLOAD | If true, prints the standard logging payload to the console - useful for debugging
 | LOGFIRE_TOKEN | Token for Logfire logging service
+| MISTRAL_API_BASE | Base URL for Mistral API
+| MISTRAL_API_KEY | API key for Mistral API
 | MICROSOFT_CLIENT_ID | Client ID for Microsoft services
 | MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
 | MICROSOFT_TENANT | Tenant ID for Microsoft Azure
--- a/docs/my-website/docs/proxy/cost_tracking.md
+++ b/docs/my-website/docs/proxy/cost_tracking.md
@ -205,28 +205,6 @@ curl -X POST \
 {"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
 ```

-
-## Set 'base_model' for Cost Tracking (e.g. Azure deployments)
-
-**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
-
-**Solution** ✅ :  Set `base_model` on your config so litellm uses the correct model for calculating azure cost
-
-Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
-
-Example config with `base_model`
-```yaml
-model_list:
-  - model_name: azure-gpt-3.5
-    litellm_params:
-      model: azure/chatgpt-v-2
-      api_base: os.environ/AZURE_API_BASE
-      api_key: os.environ/AZURE_API_KEY
-      api_version: "2023-07-01-preview"
-    model_info:
-      base_model: azure/gpt-4-1106-preview
-```
-
 ## Daily Spend Breakdown API

 Retrieve granular daily usage data for a user (by model, provider, and API key) with a single endpoint.
--- a/docs/my-website/docs/proxy/custom_pricing.md
+++ b/docs/my-website/docs/proxy/custom_pricing.md
@ -83,6 +83,28 @@ model_list:
      cache_read_input_token_cost: 0.0000006
 ```

+## Set 'base_model' for Cost Tracking (e.g. Azure deployments)
+
+**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
+
+**Solution** ✅ :  Set `base_model` on your config so litellm uses the correct model for calculating azure cost
+
+Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
+
+Example config with `base_model`
+```yaml
+model_list:
+  - model_name: azure-gpt-3.5
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: os.environ/AZURE_API_BASE
+      api_key: os.environ/AZURE_API_KEY
+      api_version: "2023-07-01-preview"
+    model_info:
+      base_model: azure/gpt-4-1106-preview
+```
+
+
 ## Debugging 

 If you're custom pricing is not being used or you're seeing errors, please check the following:
--- a/docs/my-website/docs/proxy/litellm_managed_files.md
+++ b/docs/my-website/docs/proxy/litellm_managed_files.md
@ -0,0 +1,279 @@
+import TabItem from '@theme/TabItem';
+import Tabs from '@theme/Tabs';
+import Image from '@theme/IdealImage';
+
+# [BETA] Unified File ID
+
+Reuse the same 'file id' across different providers.
+
+| Feature | Description | Comments |
+| --- | --- | --- |
+| Proxy | ✅ |  |
+| SDK | ❌ | Requires postgres DB for storing file ids |
+| Available across all providers | ✅ |  |
+
+
+
+Limitations of LiteLLM Managed Files:
+- Only works for `/chat/completions` requests. 
+- Assumes just 1 model configured per model_name.
+
+Follow [here](https://github.com/BerriAI/litellm/discussions/9632) for multiple models, batches support.
+
+### 1. Setup config.yaml
+
+```yaml
+model_list:
+    - model_name: "gemini-2.0-flash"
+      litellm_params:
+        model: vertex_ai/gemini-2.0-flash
+        vertex_project: my-project-id
+        vertex_location: us-central1
+    - model_name: "gpt-4o-mini-openai"
+      litellm_params:
+        model: gpt-4o-mini
+        api_key: os.environ/OPENAI_API_KEY
+```
+
+### 2. Start proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+### 3. Test it!
+
+Specify `target_model_names` to use the same file id across different providers. This is the list of model_names set via config.yaml (or 'public_model_names' on UI). 
+
+```python
+target_model_names="gpt-4o-mini-openai, gemini-2.0-flash" # 👈 Specify model_names
+```
+
+Check `/v1/models` to see the list of available model names for a key.
+
+#### **Store a PDF file**
+
+```python
+import requests
+from openai import OpenAI
+
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
+
+# Download and save the PDF locally 
+url = (
+    "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
+)
+response = requests.get(url)
+response.raise_for_status()
+
+# Save the PDF locally
+with open("2403.05530.pdf", "wb") as f:
+    f.write(response.content)
+
+file = client.files.create(
+    file=open("2403.05530.pdf", "rb"),
+    purpose="user_data", # can be any openai 'purpose' value
+    extra_body={"target_model_names": "gpt-4o-mini-openai, gemini-2.0-flash"}, # 👈 Specify model_names
+)
+
+print(f"file id={file.id}")
+```
+
+#### **Use the same file id across different providers**
+
+<Tabs>
+<TabItem value="openai" label="OpenAI">
+
+```python
+completion = client.chat.completions.create(
+    model="gpt-4o-mini-openai",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this recording?"},
+                {
+                    "type": "file",
+                    "file": {
+                        "file_id": file.id,
+                    },
+                },
+            ],
+        },
+    ]
+)
+
+print(completion.choices[0].message)
+```
+
+
+</TabItem>
+<TabItem value="vertex" label="Vertex AI">
+
+```python
+completion = client.chat.completions.create(
+    model="gemini-2.0-flash",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this recording?"},
+                {
+                    "type": "file",
+                    "file": {
+                        "file_id": file.id,
+                    },
+                },
+            ],
+        },
+    ]
+)
+
+print(completion.choices[0].message)
+
+```
+
+</TabItem>
+</Tabs>
+
+### Complete Example
+
+```python   
+import base64
+import requests
+from openai import OpenAI
+
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
+
+
+# Download and save the PDF locally
+url = (
+    "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
+)
+response = requests.get(url)
+response.raise_for_status()
+
+# Save the PDF locally
+with open("2403.05530.pdf", "wb") as f:
+    f.write(response.content)
+
+# Read the local PDF file
+file = client.files.create(
+    file=open("2403.05530.pdf", "rb"),
+    purpose="user_data", # can be any openai 'purpose' value
+    extra_body={"target_model_names": "gpt-4o-mini-openai, gemini-2.0-flash"}, # 👈 Specify model_names
+)
+
+print(f"file.id: {file.id}") # 👈 Unified file id
+
+### GEMINI CALL ###
+completion = client.chat.completions.create(
+    model="gemini-2.0-flash",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this recording?"},
+                {
+                    "type": "file",
+                    "file": {
+                        "file_id": file.id,
+                    },
+                },
+            ],
+        },
+    ]
+)
+
+print(completion.choices[0].message)
+
+
+### OPENAI CALL ### 
+completion = client.chat.completions.create(
+    model="gpt-4o-mini-openai",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this recording?"},
+                {
+                    "type": "file",
+                    "file": {
+                        "file_id": file.id,
+                    },
+                },
+            ],
+        },
+    ],
+)
+
+print(completion.choices[0].message)
+
+```
+
+
+### Supported Endpoints
+
+#### Create a file - `/files`
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
+
+# Download and save the PDF locally
+url = (
+    "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
+)
+response = requests.get(url)
+response.raise_for_status()
+
+# Save the PDF locally
+with open("2403.05530.pdf", "wb") as f:
+    f.write(response.content)
+
+# Read the local PDF file
+file = client.files.create(
+    file=open("2403.05530.pdf", "rb"),
+    purpose="user_data", # can be any openai 'purpose' value
+    extra_body={"target_model_names": "gpt-4o-mini-openai, gemini-2.0-flash"}, # 👈 Specify model_names
+)
+```
+
+#### Retrieve a file - `/files/{file_id}`
+
+```python
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
+
+file = client.files.retrieve(file_id=file.id)
+```
+
+#### Delete a file - `/files/{file_id}/delete`
+
+```python
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0)
+
+file = client.files.delete(file_id=file.id)
+```
+
+### FAQ
+
+**1. Does LiteLLM store the file?**
+
+No, LiteLLM does not store the file. It only stores the file ids in the postgres DB.
+
+**2. How does LiteLLM know which file to use for a given file id?**
+
+LiteLLM stores a mapping of the litellm file id to the model-specific file id in the postgres DB. When a request comes in, LiteLLM looks up the model-specific file id and uses it in the request to the provider.
+
+**3. How do file deletions work?**
+
+When a file is deleted, LiteLLM deletes the mapping from the postgres DB, and the files on each provider.
+
+### Architecture
+
+
+
+
+
+<Image img={require('../../img/managed_files_arch.png')}  style={{ width: '800px', height: 'auto' }} />
--- a/docs/my-website/docs/proxy/logging.md
+++ b/docs/my-website/docs/proxy/logging.md
@ -862,7 +862,7 @@ Add the following to your env

 ```shell
 OTEL_EXPORTER="otlp_http"
-OTEL_ENDPOINT="http:/0.0.0.0:4317"
+OTEL_ENDPOINT="http://0.0.0.0:4317"
 OTEL_HEADERS="x-honeycomb-team=<your-api-key>" # Optional
 ```

@ -2501,4 +2501,4 @@ litellm_settings:
 :::info
 `thresholds` are not required by default, but you can tune the values to your needs.
 Default values is `4` for all categories
-::: -->
+::: -->
--- a/docs/my-website/docs/proxy/model_discovery.md
+++ b/docs/my-website/docs/proxy/model_discovery.md
@ -0,0 +1,108 @@
+# Model Discovery
+
+Use this to give users an accurate list of the models available behind a provider endpoint when calling `/v1/models` for wildcard models.
+
+## Supported Providers
+
+- Fireworks AI
+- OpenAI
+- Gemini
+- LiteLLM Proxy
+- Topaz
+- Anthropic
+- XAI
+- VLLM
+- Vertex AI
+
+### Usage
+
+**1. Setup config.yaml**
+
+```yaml
+model_list:
+    - model_name: xai/*
+      litellm_params:
+        model: xai/*
+        api_key: os.environ/XAI_API_KEY
+
+litellm_settings:
+    check_provider_endpoint: true # 👈 Enable checking provider endpoint for wildcard models
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+**3. Call `/v1/models`**
+
+```bash
+curl -X GET "http://localhost:4000/v1/models" -H "Authorization: Bearer $LITELLM_KEY"
+```
+
+Expected response
+
+```json
+{
+    "data": [
+        {
+            "id": "xai/grok-2-1212",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-2-vision-1212",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-3-beta",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-3-fast-beta",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-3-mini-beta",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-3-mini-fast-beta",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-beta",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-vision-beta",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        },
+        {
+            "id": "xai/grok-2-image-1212",
+            "object": "model",
+            "created": 1677610602,
+            "owned_by": "openai"
+        }
+    ],
+    "object": "list"
+}
+```
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@ -95,7 +95,14 @@ Use this for for tracking per [user, key, team, etc.](virtual_keys)

 ### Initialize Budget Metrics on Startup

-If you want to initialize the key/team budget metrics on startup, you can set the `prometheus_initialize_budget_metrics` to `true` in the `config.yaml`
+If you want litellm to emit the budget metrics for all keys and teams, regardless of whether they are receiving requests, set `prometheus_initialize_budget_metrics` to `true` in the `config.yaml`
+
+**How this works:**
+
+- If the `prometheus_initialize_budget_metrics` is set to `true`
+  - Every 5 minutes litellm runs a cron job to read all keys, teams from the database
+  - It then emits the budget metrics for each key, team
+  - This is used to populate the budget metrics on the `/metrics` endpoint

 ```yaml
 litellm_settings:
--- a/docs/my-website/docs/reasoning_content.md
+++ b/docs/my-website/docs/reasoning_content.md
@ -16,6 +16,8 @@ Supported Providers:
 - Vertex AI (Anthropic) (`vertexai/`)
 - OpenRouter (`openrouter/`)
 - XAI (`xai/`)
+- Google AI Studio (`gemini/`)
+- Vertex AI (`vertex_ai/`)

 LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message.

@ -23,7 +25,7 @@ LiteLLM will standardize the `reasoning_content` in the response and `thinking_b
 "message": {
    ...
    "reasoning_content": "The capital of France is Paris.",
-    "thinking_blocks": [
+    "thinking_blocks": [ # only returned for Anthropic models
        {
            "type": "thinking",
            "thinking": "The capital of France is Paris.",
--- a/docs/my-website/docs/response_api.md
+++ b/docs/my-website/docs/response_api.md
@ -14,22 +14,22 @@ LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](http
 | Fallbacks | ✅ | Works between supported models |
 | Loadbalancing | ✅ | Works between supported models |
 | Supported LiteLLM Versions | 1.63.8+ | |
-| Supported LLM providers | `openai` | |
+| Supported LLM providers | **All LiteLLM supported providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. |

 ## Usage

-## Create a model response
+### LiteLLM Python SDK

 <Tabs>
-<TabItem value="litellm-sdk" label="LiteLLM SDK">
+<TabItem value="openai" label="OpenAI">

 #### Non-streaming
-```python
+```python showLineNumbers title="OpenAI Non-streaming Response"
 import litellm

 # Non-streaming response
 response = litellm.responses(
-    model="o1-pro",
+    model="openai/o1-pro",
    input="Tell me a three sentence bedtime story about a unicorn.",
    max_output_tokens=100
 )
@ -38,12 +38,12 @@ print(response)
 ```

 #### Streaming
-```python
+```python showLineNumbers title="OpenAI Streaming Response"
 import litellm

 # Streaming response
 response = litellm.responses(
-    model="o1-pro",
+    model="openai/o1-pro",
    input="Tell me a three sentence bedtime story about a unicorn.",
    stream=True
 )
@ -53,58 +53,169 @@ for event in response:
 ```

 </TabItem>
-<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">

-First, add this to your litellm proxy config.yaml:
-```yaml
-model_list:
-  - model_name: o1-pro
-    litellm_params:
-      model: openai/o1-pro
-      api_key: os.environ/OPENAI_API_KEY
-```
-
-Start your LiteLLM proxy:
-```bash
-litellm --config /path/to/config.yaml
-
-# RUNNING on http://0.0.0.0:4000
-```
-
-Then use the OpenAI SDK pointed to your proxy:
+<TabItem value="anthropic" label="Anthropic">

 #### Non-streaming
-```python
-from openai import OpenAI
+```python showLineNumbers title="Anthropic Non-streaming Response"
+import litellm
+import os

-# Initialize client with your proxy URL
-client = OpenAI(
-    base_url="http://localhost:4000",  # Your proxy URL
-    api_key="your-api-key"             # Your proxy API key
-)
+# Set API key
+os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"

 # Non-streaming response
-response = client.responses.create(
-    model="o1-pro",
-    input="Tell me a three sentence bedtime story about a unicorn."
+response = litellm.responses(
+    model="anthropic/claude-3-5-sonnet-20240620",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    max_output_tokens=100
 )

 print(response)
 ```

 #### Streaming
-```python
-from openai import OpenAI
+```python showLineNumbers title="Anthropic Streaming Response"
+import litellm
+import os

-# Initialize client with your proxy URL
-client = OpenAI(
-    base_url="http://localhost:4000",  # Your proxy URL
-    api_key="your-api-key"             # Your proxy API key
-)
+# Set API key
+os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key"

 # Streaming response
-response = client.responses.create(
-    model="o1-pro",
+response = litellm.responses(
+    model="anthropic/claude-3-5-sonnet-20240620",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="vertex" label="Vertex AI">
+
+#### Non-streaming
+```python showLineNumbers title="Vertex AI Non-streaming Response"
+import litellm
+import os
+
+# Set credentials - Vertex AI uses application default credentials
+# Run 'gcloud auth application-default login' to authenticate
+os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
+os.environ["VERTEXAI_LOCATION"] = "us-central1"
+
+# Non-streaming response
+response = litellm.responses(
+    model="vertex_ai/gemini-1.5-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    max_output_tokens=100
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="Vertex AI Streaming Response"
+import litellm
+import os
+
+# Set credentials - Vertex AI uses application default credentials
+# Run 'gcloud auth application-default login' to authenticate
+os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id"
+os.environ["VERTEXAI_LOCATION"] = "us-central1"
+
+# Streaming response
+response = litellm.responses(
+    model="vertex_ai/gemini-1.5-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="bedrock" label="AWS Bedrock">
+
+#### Non-streaming
+```python showLineNumbers title="AWS Bedrock Non-streaming Response"
+import litellm
+import os
+
+# Set AWS credentials
+os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
+os.environ["AWS_REGION_NAME"] = "us-west-2"  # or your AWS region
+
+# Non-streaming response
+response = litellm.responses(
+    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    max_output_tokens=100
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="AWS Bedrock Streaming Response"
+import litellm
+import os
+
+# Set AWS credentials
+os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key"
+os.environ["AWS_REGION_NAME"] = "us-west-2"  # or your AWS region
+
+# Streaming response
+response = litellm.responses(
+    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="gemini" label="Google AI Studio">
+
+#### Non-streaming
+```python showLineNumbers title="Google AI Studio Non-streaming Response"
+import litellm
+import os
+
+# Set API key for Google AI Studio
+os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
+
+# Non-streaming response
+response = litellm.responses(
+    model="gemini/gemini-1.5-flash",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    max_output_tokens=100
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="Google AI Studio Streaming Response"
+import litellm
+import os
+
+# Set API key for Google AI Studio
+os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"
+
+# Streaming response
+response = litellm.responses(
+    model="gemini/gemini-1.5-flash",
    input="Tell me a three sentence bedtime story about a unicorn.",
    stream=True
 )
@ -115,3 +226,297 @@ for event in response:

 </TabItem>
 </Tabs>
+
+### LiteLLM Proxy with OpenAI SDK
+
+First, set up and start your LiteLLM proxy server.
+
+```bash title="Start LiteLLM Proxy Server"
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+<Tabs>
+<TabItem value="openai" label="OpenAI">
+
+First, add this to your litellm proxy config.yaml:
+```yaml showLineNumbers title="OpenAI Proxy Configuration"
+model_list:
+  - model_name: openai/o1-pro
+    litellm_params:
+      model: openai/o1-pro
+      api_key: os.environ/OPENAI_API_KEY
+```
+
+#### Non-streaming
+```python showLineNumbers title="OpenAI Proxy Non-streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Non-streaming response
+response = client.responses.create(
+    model="openai/o1-pro",
+    input="Tell me a three sentence bedtime story about a unicorn."
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="OpenAI Proxy Streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Streaming response
+response = client.responses.create(
+    model="openai/o1-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="anthropic" label="Anthropic">
+
+First, add this to your litellm proxy config.yaml:
+```yaml showLineNumbers title="Anthropic Proxy Configuration"
+model_list:
+  - model_name: anthropic/claude-3-5-sonnet-20240620
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20240620
+      api_key: os.environ/ANTHROPIC_API_KEY
+```
+
+#### Non-streaming
+```python showLineNumbers title="Anthropic Proxy Non-streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Non-streaming response
+response = client.responses.create(
+    model="anthropic/claude-3-5-sonnet-20240620",
+    input="Tell me a three sentence bedtime story about a unicorn."
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="Anthropic Proxy Streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Streaming response
+response = client.responses.create(
+    model="anthropic/claude-3-5-sonnet-20240620",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="vertex" label="Vertex AI">
+
+First, add this to your litellm proxy config.yaml:
+```yaml showLineNumbers title="Vertex AI Proxy Configuration"
+model_list:
+  - model_name: vertex_ai/gemini-1.5-pro
+    litellm_params:
+      model: vertex_ai/gemini-1.5-pro
+      vertex_project: your-gcp-project-id
+      vertex_location: us-central1
+```
+
+#### Non-streaming
+```python showLineNumbers title="Vertex AI Proxy Non-streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Non-streaming response
+response = client.responses.create(
+    model="vertex_ai/gemini-1.5-pro",
+    input="Tell me a three sentence bedtime story about a unicorn."
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="Vertex AI Proxy Streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Streaming response
+response = client.responses.create(
+    model="vertex_ai/gemini-1.5-pro",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="bedrock" label="AWS Bedrock">
+
+First, add this to your litellm proxy config.yaml:
+```yaml showLineNumbers title="AWS Bedrock Proxy Configuration"
+model_list:
+  - model_name: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
+    litellm_params:
+      model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0
+      aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
+      aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
+      aws_region_name: us-west-2
+```
+
+#### Non-streaming
+```python showLineNumbers title="AWS Bedrock Proxy Non-streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Non-streaming response
+response = client.responses.create(
+    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+    input="Tell me a three sentence bedtime story about a unicorn."
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="AWS Bedrock Proxy Streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Streaming response
+response = client.responses.create(
+    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+
+<TabItem value="gemini" label="Google AI Studio">
+
+First, add this to your litellm proxy config.yaml:
+```yaml showLineNumbers title="Google AI Studio Proxy Configuration"
+model_list:
+  - model_name: gemini/gemini-1.5-flash
+    litellm_params:
+      model: gemini/gemini-1.5-flash
+      api_key: os.environ/GEMINI_API_KEY
+```
+
+#### Non-streaming
+```python showLineNumbers title="Google AI Studio Proxy Non-streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Non-streaming response
+response = client.responses.create(
+    model="gemini/gemini-1.5-flash",
+    input="Tell me a three sentence bedtime story about a unicorn."
+)
+
+print(response)
+```
+
+#### Streaming
+```python showLineNumbers title="Google AI Studio Proxy Streaming Response"
+from openai import OpenAI
+
+# Initialize client with your proxy URL
+client = OpenAI(
+    base_url="http://localhost:4000",  # Your proxy URL
+    api_key="your-api-key"             # Your proxy API key
+)
+
+# Streaming response
+response = client.responses.create(
+    model="gemini/gemini-1.5-flash",
+    input="Tell me a three sentence bedtime story about a unicorn.",
+    stream=True
+)
+
+for event in response:
+    print(event)
+```
+
+</TabItem>
+</Tabs>
+
+## Supported Responses API Parameters
+
+| Provider | Supported Parameters |
+|----------|---------------------|
+| `openai` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
+| `azure` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) |
+| `anthropic` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
+| `bedrock` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
+| `gemini` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
+| `vertex_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
+| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
+| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
+
--- a/docs/my-website/docs/tutorials/openai_codex.md
+++ b/docs/my-website/docs/tutorials/openai_codex.md
@ -0,0 +1,146 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Using LiteLLM with OpenAI Codex
+
+This guide walks you through connecting OpenAI Codex to LiteLLM. Using LiteLLM with Codex allows teams to:
+- Access 100+ LLMs through the Codex interface
+- Use powerful models like Gemini through a familiar interface
+- Track spend and usage with LiteLLM's built-in analytics
+- Control model access with virtual keys
+
+<Image img={require('../../img/litellm_codex.gif')} />
+
+## Quickstart
+
+:::info
+
+Requires LiteLLM v1.66.3.dev5 or higher
+
+:::
+
+
+Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](../proxy/docker_quick_start.md).
+
+## 1. Install OpenAI Codex
+
+Install the OpenAI Codex CLI tool globally using npm or yarn:
+
+<Tabs>
+<TabItem value="npm" label="npm">
+
+```bash showLineNumbers
+npm i -g @openai/codex
+```
+
+</TabItem>
+<TabItem value="yarn" label="yarn">
+
+```bash showLineNumbers
+yarn global add @openai/codex
+```
+
+</TabItem>
+</Tabs>
+
+## 2. Start LiteLLM Proxy
+
+<Tabs>
+<TabItem value="docker" label="Docker">
+
+```bash showLineNumbers
+docker run \
+    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -p 4000:4000 \
+    ghcr.io/berriai/litellm:main-latest \
+    --config /app/config.yaml
+```
+
+</TabItem>
+<TabItem value="pip" label="LiteLLM CLI">
+
+```bash showLineNumbers
+litellm --config /path/to/config.yaml
+```
+
+</TabItem>
+</Tabs>
+
+LiteLLM should now be running on [http://localhost:4000](http://localhost:4000)
+
+## 3. Configure LiteLLM for Model Routing
+
+Ensure your LiteLLM Proxy is properly configured to route to your desired models. Create a `litellm_config.yaml` file with the following content:
+
+```yaml showLineNumbers
+model_list:
+  - model_name: o3-mini
+    litellm_params:
+      model: openai/o3-mini
+      api_key: os.environ/OPENAI_API_KEY
+  - model_name: claude-3-7-sonnet-latest
+    litellm_params:
+      model: anthropic/claude-3-7-sonnet-latest
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: gemini-2.0-flash
+    litellm_params:
+      model: gemini/gemini-2.0-flash
+      api_key: os.environ/GEMINI_API_KEY
+
+litellm_settings:
+  drop_params: true
+```
+
+This configuration enables routing to specific OpenAI, Anthropic, and Gemini models with explicit names.
+
+## 4. Configure Codex to Use LiteLLM Proxy
+
+Set the required environment variables to point Codex to your LiteLLM Proxy:
+
+```bash
+# Point to your LiteLLM Proxy server
+export OPENAI_BASE_URL=http://0.0.0.0:4000 
+
+# Use your LiteLLM API key (if you've set up authentication)
+export OPENAI_API_KEY="sk-1234"
+```
+
+## 5. Run Codex with Gemini
+
+With everything configured, you can now run Codex with Gemini:
+
+```bash showLineNumbers
+codex --model gemini-2.0-flash --full-auto
+```
+
+<Image img={require('../../img/litellm_codex.gif')} />
+
+The `--full-auto` flag allows Codex to automatically generate code without additional prompting.
+
+## 6. Advanced Options
+
+### Using Different Models
+
+You can use any model configured in your LiteLLM proxy:
+
+```bash
+# Use Claude models
+codex --model claude-3-7-sonnet-latest
+
+# Use Google AI Studio Gemini models
+codex --model gemini-2.0-flash
+```
+
+## Troubleshooting
+
+- If you encounter connection issues, ensure your LiteLLM Proxy is running and accessible at the specified URL
+- Verify your LiteLLM API key is valid if you're using authentication
+- Check that your model routing configuration is correct
+- For model-specific errors, ensure the model is properly configured in your LiteLLM setup
+
+## Additional Resources
+
+- [LiteLLM Docker Quick Start Guide](../proxy/docker_quick_start.md)
+- [OpenAI Codex GitHub Repository](https://github.com/openai/codex)
+- [LiteLLM Virtual Keys and Authentication](../proxy/virtual_keys.md)
--- a/docs/my-website/docs/tutorials/prompt_caching.md
+++ b/docs/my-website/docs/tutorials/prompt_caching.md
@ -0,0 +1,128 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Auto-Inject Prompt Caching Checkpoints
+
+Reduce costs by up to 90% by using LiteLLM to auto-inject prompt caching checkpoints.
+
+<Image img={require('../../img/auto_prompt_caching.png')}  style={{ width: '800px', height: 'auto' }} />
+
+
+## How it works
+
+LiteLLM can automatically inject prompt caching checkpoints into your requests to LLM providers. This allows:
+
+- **Cost Reduction**: Long, static parts of your prompts can be cached to avoid repeated processing
+- **No need to modify your application code**: You can configure the auto-caching behavior in the LiteLLM UI or in the `litellm config.yaml` file.
+
+## Configuration
+
+You need to specify `cache_control_injection_points` in your model configuration. This tells LiteLLM:
+1. Where to add the caching directive (`location`)
+2. Which message to target (`role`)
+
+LiteLLM will then automatically add a `cache_control` directive to the specified messages in your requests:
+
+```json
+"cache_control": {
+    "type": "ephemeral"
+}
+```
+
+## Usage Example 
+
+In this example, we'll configure caching for system messages by adding the directive to all messages with `role: system`.
+
+<Tabs>
+<TabItem value="litellm config.yaml" label="litellm config.yaml">
+
+```yaml showLineNumbers title="litellm config.yaml"
+model_list:
+  - model_name: anthropic-auto-inject-cache-system-message
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20240620
+      api_key: os.environ/ANTHROPIC_API_KEY
+      cache_control_injection_points:
+        - location: message
+          role: system
+```
+</TabItem>
+
+<TabItem value="UI" label="LiteLLM UI">
+
+On the LiteLLM UI, you can specify the `cache_control_injection_points` in the `Advanced Settings` tab when adding a model.
+<Image img={require('../../img/ui_auto_prompt_caching.png')}/>
+
+</TabItem>
+</Tabs>
+
+
+## Detailed Example
+
+### 1. Original Request to LiteLLM 
+
+In this example, we have a very long, static system message and a varying user message. It's efficient to cache the system message since it rarely changes.
+
+```json
+{
+    "messages": [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question."
+                }
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What is the main topic of this legal document?"
+                }
+            ]
+        }
+    ]
+}
+```
+
+### 2. LiteLLM's Modified Request
+
+LiteLLM auto-injects the caching directive into the system message based on our configuration:
+
+```json
+{
+    "messages": [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question.",
+                    "cache_control": {"type": "ephemeral"}
+                }
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What is the main topic of this legal document?"
+                }
+            ]
+        }
+    ]
+}
+```
+
+When the model provider processes this request, it will recognize the caching directive and only process the system message once, caching it for subsequent requests.
+
+
+    
+
+
+
--- a/docs/my-website/docs/tutorials/scim_litellm.md
+++ b/docs/my-website/docs/tutorials/scim_litellm.md
@ -0,0 +1,74 @@
+
+import Image from '@theme/IdealImage';
+
+# SCIM with LiteLLM
+
+SCIM enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning on LiteLLM.
+
+
+This tutorial will walk you through the steps to connect your IDP to LiteLLM SCIM Endpoints.
+
+### Supported SSO Providers for SCIM
+Below is a list of supported SSO providers for connecting to LiteLLM SCIM Endpoints.
+- Microsoft Entra ID (Azure AD)
+- Okta
+- Google Workspace
+- OneLogin
+- Keycloak
+- Auth0
+
+
+## 1. Get your SCIM Tenant URL and Bearer Token
+
+On LiteLLM, navigate to Settings > Admin Settings > SCIM. On this page, create a SCIM token; this token allows your IDP to authenticate to the LiteLLM `/scim` endpoints.
+
+<Image img={require('../../img/scim_2.png')}  style={{ width: '800px', height: 'auto' }} />
+
+## 2. Connect your IDP to LiteLLM SCIM Endpoints
+
+In your IDP, navigate to your SSO application and select `Provisioning` > `New provisioning configuration`.
+
+On this page, paste in your LiteLLM SCIM tenant URL and bearer token.
+
+Once this is pasted in, click on `Test Connection` to ensure your IDP can authenticate to the LiteLLM SCIM endpoints.
+
+<Image img={require('../../img/scim_4.png')}  style={{ width: '800px', height: 'auto' }} />
+
+
+## 3. Test SCIM Connection
+
+### 3.1 Assign the group to your LiteLLM Enterprise App
+
+On your IDP Portal, navigate to `Enterprise Applications` > Select your litellm app 
+
+<Image img={require('../../img/msft_enterprise_app.png')}  style={{ width: '800px', height: 'auto' }} />
+
+<br />
+<br />
+
+Once you've selected your litellm app, click on `Users and Groups` > `Add user/group` 
+
+<Image img={require('../../img/msft_enterprise_assign_group.png')}  style={{ width: '800px', height: 'auto' }} />
+
+<br />
+
+Now select the group you created in your IDP and add it to the LiteLLM Enterprise App. At this point we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` on the LiteLLM DB when a new user signs in.
+
+<Image img={require('../../img/msft_enterprise_select_group.png')}  style={{ width: '800px', height: 'auto' }} />
+
+
+### 3.2 Sign in to LiteLLM UI via SSO
+
+Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
+
+<Image img={require('../../img/msft_sso_sign_in.png')}  style={{ width: '800px', height: 'auto' }} />
+
+### 3.3 Check the new team on LiteLLM UI
+
+On the LiteLLM UI, navigate to `Teams`. You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
+
+<Image img={require('../../img/msft_auto_team.png')}  style={{ width: '900px', height: 'auto' }} />
+
+
+
+
--- a/docs/my-website/docs/tutorials/tag_management.md
+++ b/docs/my-website/docs/tutorials/tag_management.md
@ -0,0 +1,145 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# [Beta] Routing based on request metadata
+
+Create routing rules based on request metadata.
+
+## Setup
+
+Add the following to your litellm proxy config yaml file.
+
+```yaml showLineNumbers title="litellm proxy config.yaml"
+router_settings:
+  enable_tag_filtering: True # 👈 Key Change
+```
+
+## 1. Create a tag
+
+On the LiteLLM UI, navigate to Experimental > Tag Management > Create Tag.
+
+Create a tag called `private-data` and only select the allowed models for requests with this tag. Once created, you will see the tag in the Tag Management page.
+
+<Image img={require('../../img/tag_create.png')}  style={{ width: '800px', height: 'auto' }} />
+
+
+## 2. Test Tag Routing
+
+Now we will test the tag based routing rules.
+
+### 2.1 Invalid model
+
+This request will fail since we send `tags=private-data` but the model `gpt-4o` is not in the allowed models for the `private-data` tag.
+
+<Image img={require('../../img/tag_invalid.png')}  style={{ width: '800px', height: 'auto' }} />
+
+<br />
+
+Here is an example sending the same request using the OpenAI Python SDK.
+<Tabs>
+<TabItem value="python" label="OpenAI Python SDK">
+
+```python showLineNumbers
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-1234",
+    base_url="http://0.0.0.0:4000/v1/"
+)
+
+response = client.chat.completions.create(
+    model="gpt-4o",
+    messages=[
+        {"role": "user", "content": "Hello, how are you?"}
+    ],
+    extra_body={
+        "tags": "private-data"
+    }
+)
+```
+
+</TabItem>
+<TabItem value="curl" label="cURL">
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+  "model": "gpt-4o",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hello, how are you?"
+    }
+  ],
+  "tags": "private-data"
+}'
+```
+
+</TabItem>
+</Tabs>
+
+<br />
+
+### 2.2 Valid model
+
+This request will succeed since we send `tags=private-data` and the model `us.anthropic.claude-3-7-sonnet-20250219-v1:0` is in the allowed models for the `private-data` tag.
+
+<Image img={require('../../img/tag_valid.png')}  style={{ width: '800px', height: 'auto' }} />
+
+Here is an example sending the same request using the OpenAI Python SDK.
+
+<Tabs>
+<TabItem value="python" label="OpenAI Python SDK">
+
+```python showLineNumbers
+from openai import OpenAI
+
+client = OpenAI(
+    api_key="sk-1234",
+    base_url="http://0.0.0.0:4000/v1/"
+)
+
+response = client.chat.completions.create(
+    model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+    messages=[
+        {"role": "user", "content": "Hello, how are you?"}
+    ],
+    extra_body={
+        "tags": "private-data"
+    }
+)
+```
+
+</TabItem>
+<TabItem value="curl" label="cURL">
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+  "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hello, how are you?"
+    }
+  ],
+  "tags": "private-data"
+}'
+```
+
+</TabItem>
+</Tabs>
+
+
+
+## Additional Tag Features
+- [Sending tags in request headers](https://docs.litellm.ai/docs/proxy/tag_routing#calling-via-request-header)
+- [Tag based routing](https://docs.litellm.ai/docs/proxy/tag_routing)
+- [Track spend per tag](https://docs.litellm.ai/docs/proxy/cost_tracking#-custom-tags)
+- [Setup Budgets per Virtual Key, Team](https://docs.litellm.ai/docs/proxy/users)
+
--- a/docs/my-website/img/auto_prompt_caching.png
+++ b/docs/my-website/img/auto_prompt_caching.png
--- a/docs/my-website/img/litellm_codex.gif
+++ b/docs/my-website/img/litellm_codex.gif
--- a/docs/my-website/img/managed_files_arch.png
+++ b/docs/my-website/img/managed_files_arch.png
--- a/docs/my-website/img/realtime_api.png
+++ b/docs/my-website/img/realtime_api.png
--- a/docs/my-website/img/release_notes/chat_metrics.png
+++ b/docs/my-website/img/release_notes/chat_metrics.png
--- a/docs/my-website/img/release_notes/new_tag_usage.png
+++ b/docs/my-website/img/release_notes/new_tag_usage.png
--- a/docs/my-website/img/release_notes/new_team_usage.png
+++ b/docs/my-website/img/release_notes/new_team_usage.png
--- a/docs/my-website/img/release_notes/new_team_usage_highlight.jpg
+++ b/docs/my-website/img/release_notes/new_team_usage_highlight.jpg
--- a/docs/my-website/img/release_notes/sso_sync.png
+++ b/docs/my-website/img/release_notes/sso_sync.png
--- a/docs/my-website/img/release_notes/tag_management.png
+++ b/docs/my-website/img/release_notes/tag_management.png
--- a/docs/my-website/img/release_notes/unified_responses_api_rn.png
+++ b/docs/my-website/img/release_notes/unified_responses_api_rn.png
--- a/docs/my-website/img/scim_0.png
+++ b/docs/my-website/img/scim_0.png
--- a/docs/my-website/img/scim_1.png
+++ b/docs/my-website/img/scim_1.png
--- a/docs/my-website/img/scim_2.png
+++ b/docs/my-website/img/scim_2.png
--- a/docs/my-website/img/scim_3.png
+++ b/docs/my-website/img/scim_3.png
--- a/docs/my-website/img/scim_4.png
+++ b/docs/my-website/img/scim_4.png
--- a/docs/my-website/img/scim_integration.png
+++ b/docs/my-website/img/scim_integration.png
--- a/docs/my-website/img/tag_create.png
+++ b/docs/my-website/img/tag_create.png
--- a/docs/my-website/img/tag_invalid.png
+++ b/docs/my-website/img/tag_invalid.png
--- a/docs/my-website/img/tag_valid.png
+++ b/docs/my-website/img/tag_valid.png
--- a/docs/my-website/img/ui_auto_prompt_caching.png
+++ b/docs/my-website/img/ui_auto_prompt_caching.png
--- a/docs/my-website/package-lock.json
+++ b/docs/my-website/package-lock.json
@ -2148,9 +2148,10 @@
      }
    },
    "node_modules/@babel/runtime": {
-      "version": "7.26.0",
-      "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.26.0.tgz",
-      "integrity": "sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw==",
+      "version": "7.27.0",
+      "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.0.tgz",
+      "integrity": "sha512-VtPOkrdPHZsKc/clNqyi9WUA8TINkZ4cGk63UUE3u4pmB2k+ZMQRDuIOagv8UVd6j7k0T3+RRIb7beKTebNbcw==",
+      "license": "MIT",
      "dependencies": {
        "regenerator-runtime": "^0.14.0"
      },
@ -12454,9 +12455,10 @@
      }
    },
    "node_modules/http-proxy-middleware": {
-      "version": "2.0.7",
-      "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz",
-      "integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==",
+      "version": "2.0.9",
+      "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz",
+      "integrity": "sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==",
+      "license": "MIT",
      "dependencies": {
        "@types/http-proxy": "^1.17.8",
        "http-proxy": "^1.18.1",
--- a/docs/my-website/release_notes/v1.57.8-stable/index.md
+++ b/docs/my-website/release_notes/v1.57.8-stable/index.md
@ -38,7 +38,7 @@ hide_table_of_contents: false
 2. OpenAI Moderations - `omni-moderation-latest` support. [Start Here](https://docs.litellm.ai/docs/moderation)
 3. Azure O1 - fake streaming support. This ensures if a `stream=true` is passed, the response is streamed. [Start Here](https://docs.litellm.ai/docs/providers/azure)
 4. Anthropic - non-whitespace char stop sequence handling - [PR](https://github.com/BerriAI/litellm/pull/7484)
-5. Azure OpenAI - support Entra id username + password based auth. [Start Here](https://docs.litellm.ai/docs/providers/azure#entrata-id---use-tenant_id-client_id-client_secret)
+5. Azure OpenAI - support Entra ID username + password based auth. [Start Here](https://docs.litellm.ai/docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret)
 6. LM Studio - embedding route support. [Start Here](https://docs.litellm.ai/docs/providers/lm-studio)
 7. WatsonX - ZenAPIKeyAuth support. [Start Here](https://docs.litellm.ai/docs/providers/watsonx)
    
--- a/docs/my-website/release_notes/v1.66.0-stable/index.md
+++ b/docs/my-website/release_notes/v1.66.0-stable/index.md
@ -0,0 +1,197 @@
+---
+title: v1.66.0-stable - Realtime API Cost Tracking
+slug: v1.66.0-stable
+date: 2025-04-12T10:00:00
+authors:
+  - name: Krrish Dholakia
+    title: CEO, LiteLLM
+    url: https://www.linkedin.com/in/krish-d/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8
+  - name: Ishaan Jaffer
+    title: CTO, LiteLLM
+    url: https://www.linkedin.com/in/reffajnaahsi/
+    image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
+
+tags: ["sso", "unified_file_id", "cost_tracking", "security"]
+hide_table_of_contents: false
+---
+
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+## Deploy this version
+
+<Tabs>
+<TabItem value="docker" label="Docker">
+
+``` showLineNumbers title="docker run litellm"
+docker run
+-e STORE_MODEL_IN_DB=True
+-p 4000:4000
+ghcr.io/berriai/litellm:main-v1.66.0-stable
+```
+</TabItem>
+
+<TabItem value="pip" label="Pip">
+
+``` showLineNumbers title="pip install litellm"
+pip install litellm==1.66.0.post1
+```
+</TabItem>
+</Tabs>
+
+v1.66.0-stable is live now. Here are the key highlights of this release:
+
+## Key Highlights
+- **Realtime API Cost Tracking**: Track cost of realtime API calls
+- **Microsoft SSO Auto-sync**: Auto-sync groups and group members from Azure Entra ID to LiteLLM
+- **xAI grok-3**: Added support for `xai/grok-3` models
+- **Security Fixes**: Fixed [CVE-2025-0330](https://www.cve.org/CVERecord?id=CVE-2025-0330) and [CVE-2024-6825](https://www.cve.org/CVERecord?id=CVE-2024-6825) vulnerabilities
+
+Let's dive in.
+
+## Realtime API Cost Tracking
+
+<Image 
+  img={require('../../img/realtime_api.png')}
+  style={{width: '100%', display: 'block'}}
+/>
+
+
+This release adds Realtime API logging + cost tracking. 
+- **Logging**: LiteLLM now logs the complete response from realtime calls to all logging integrations (DB, S3, Langfuse, etc.) 
+- **Cost Tracking**: You can now set 'base_model' and custom pricing for realtime models. [Custom Pricing](../../docs/proxy/custom_pricing)
+- **Budgets**: Your key/user/team budgets now work for realtime models as well.
+
+Start [here](https://docs.litellm.ai/docs/realtime)
+
+
+
+## Microsoft SSO Auto-sync
+
+<Image 
+  img={require('../../img/release_notes/sso_sync.png')}
+  style={{width: '100%', display: 'block'}}
+/>
+<p style={{textAlign: 'left', color: '#666'}}>
+  Auto-sync groups and members from Azure Entra ID to LiteLLM
+</p>
+
+This release adds support for auto-syncing groups and members on Microsoft Entra ID with LiteLLM. This means that LiteLLM proxy administrators can spend less time managing teams and members, because LiteLLM handles the following:
+
+- Auto-create teams that exist on Microsoft Entra ID 
+- Sync team members on Microsoft Entra ID with LiteLLM teams
+
+Get started with this [here](https://docs.litellm.ai/docs/tutorials/msft_sso)
+
+
+## New Models / Updated Models
+
+- **xAI**
+    1. Added reasoning_effort support for `xai/grok-3-mini-beta` [Get Started](https://docs.litellm.ai/docs/providers/xai#reasoning-usage)
+    2. Added cost tracking for `xai/grok-3` models [PR](https://github.com/BerriAI/litellm/pull/9920)
+
+- **Hugging Face**
+    1. Added inference providers support [Get Started](https://docs.litellm.ai/docs/providers/huggingface#serverless-inference-providers)
+
+- **Azure**
+    1. Added azure/gpt-4o-realtime-audio cost tracking [PR](https://github.com/BerriAI/litellm/pull/9893)
+
+- **VertexAI**
+    1. Added enterpriseWebSearch tool support [Get Started](https://docs.litellm.ai/docs/providers/vertex#grounding---web-search)
+    2. Moved to only passing keys accepted by the Vertex AI response schema [PR](https://github.com/BerriAI/litellm/pull/8992)
+
+- **Google AI Studio**
+    1. Added cost tracking for `gemini-2.5-pro` [PR](https://github.com/BerriAI/litellm/pull/9837)
+    2. Fixed pricing for 'gemini/gemini-2.5-pro-preview-03-25' [PR](https://github.com/BerriAI/litellm/pull/9896)
+    3. Fixed handling file_data being passed in [PR](https://github.com/BerriAI/litellm/pull/9786)
+
+- **Azure**
+    1. Updated Azure Phi-4 pricing [PR](https://github.com/BerriAI/litellm/pull/9862)
+    2. Added azure/gpt-4o-realtime-audio cost tracking [PR](https://github.com/BerriAI/litellm/pull/9893)
+
+- **Databricks**
+    1. Removed reasoning_effort from parameters [PR](https://github.com/BerriAI/litellm/pull/9811)
+    2. Fixed custom endpoint check for Databricks [PR](https://github.com/BerriAI/litellm/pull/9925)
+
+- **General**
+    1. Added litellm.supports_reasoning() util to track if an llm supports reasoning [Get Started](https://docs.litellm.ai/docs/providers/anthropic#reasoning)
+    2. Function Calling - Handle pydantic base model in message tool calls, handle tools = [], and support fake streaming on tool calls for meta.llama3-3-70b-instruct-v1:0 [PR](https://github.com/BerriAI/litellm/pull/9774)
+    3. LiteLLM Proxy - Allow passing `thinking` param to litellm proxy via client sdk [PR](https://github.com/BerriAI/litellm/pull/9386)
+    4. Fixed translation of the 'thinking' param for litellm [PR](https://github.com/BerriAI/litellm/pull/9904)
+
+
+## Spend Tracking Improvements
+- **OpenAI, Azure**
+    1. Realtime API Cost tracking with token usage metrics in spend logs [Get Started](https://docs.litellm.ai/docs/realtime)
+- **Anthropic**
+    1. Fixed Claude Haiku cache read pricing per token [PR](https://github.com/BerriAI/litellm/pull/9834)
+    2. Added cost tracking for Claude responses with base_model [PR](https://github.com/BerriAI/litellm/pull/9897)
+    3. Fixed Anthropic prompt caching cost calculation and trimmed logged message in db [PR](https://github.com/BerriAI/litellm/pull/9838)
+- **General**
+    1. Added token tracking and log usage object in spend logs [PR](https://github.com/BerriAI/litellm/pull/9843)
+    2. Handle custom pricing at deployment level [PR](https://github.com/BerriAI/litellm/pull/9855)
+
+
+## Management Endpoints / UI
+
+- **Test Key Tab**
+    1. Added rendering of Reasoning content, ttft, usage metrics on test key page [PR](https://github.com/BerriAI/litellm/pull/9931)
+
+    <Image 
+    img={require('../../img/release_notes/chat_metrics.png')}
+    style={{width: '100%', display: 'block'}}
+    />
+    <p style={{textAlign: 'left', color: '#666'}}>
+    View input, output, reasoning tokens, ttft metrics.
+    </p>
+- **Tag / Policy Management**
+    1. Added Tag/Policy Management. Create routing rules based on request metadata. This allows you to enforce that requests with `tags="private"` only go to specific models. [Get Started](https://docs.litellm.ai/docs/tutorials/tag_management)
+
+    <br />
+
+    <Image 
+    img={require('../../img/release_notes/tag_management.png')}
+    style={{width: '100%', display: 'block'}}
+    />
+    <p style={{textAlign: 'left', color: '#666'}}>
+    Create and manage tags.
+    </p>
+- **Redesigned Login Screen**
+    1. Polished login screen [PR](https://github.com/BerriAI/litellm/pull/9778)
+- **Microsoft SSO Auto-Sync**
+    1. Added debug route to allow admins to debug SSO JWT fields [PR](https://github.com/BerriAI/litellm/pull/9835)
+    2. Added ability to use MSFT Graph API to assign users to teams [PR](https://github.com/BerriAI/litellm/pull/9865)
+    3. Connected litellm to Azure Entra ID Enterprise Application [PR](https://github.com/BerriAI/litellm/pull/9872)
+    4. Added ability for admins to set `default_team_params` for when litellm SSO creates default teams [PR](https://github.com/BerriAI/litellm/pull/9895)
+    5. Fixed MSFT SSO to use correct field for user email [PR](https://github.com/BerriAI/litellm/pull/9886)
+    6. Added UI support for setting Default Team setting when litellm SSO auto creates teams [PR](https://github.com/BerriAI/litellm/pull/9918)
+- **UI Bug Fixes**
+    1. Prevented team, key, org, model numerical values changing on scrolling [PR](https://github.com/BerriAI/litellm/pull/9776)
+    2. Instantly reflect key and team updates in UI [PR](https://github.com/BerriAI/litellm/pull/9825)
+
+## Logging / Guardrail Improvements
+
+- **Prometheus**
+    1. Emit Key and Team Budget metrics on a cron job schedule [Get Started](https://docs.litellm.ai/docs/proxy/prometheus#initialize-budget-metrics-on-startup)
+
+## Security Fixes
+
+- Fixed [CVE-2025-0330](https://www.cve.org/CVERecord?id=CVE-2025-0330) - Leakage of Langfuse API keys in team exception handling [PR](https://github.com/BerriAI/litellm/pull/9830)
+- Fixed [CVE-2024-6825](https://www.cve.org/CVERecord?id=CVE-2024-6825) - Remote code execution in post call rules [PR](https://github.com/BerriAI/litellm/pull/9826)
+
+## Helm
+
+- Added service annotations to litellm-helm chart [PR](https://github.com/BerriAI/litellm/pull/9840)
+- Added extraEnvVars to the helm deployment [PR](https://github.com/BerriAI/litellm/pull/9292)
+
+## Demo
+
+Try this on the demo instance [today](https://docs.litellm.ai/docs/proxy/demo)
+
+## Complete Git Diff
+
+See the complete git diff since v1.65.4-stable, [here](https://github.com/BerriAI/litellm/releases/tag/v1.66.0-stable)
+
+
--- a/docs/my-website/release_notes/v1.67.0-stable/index.md
+++ b/docs/my-website/release_notes/v1.67.0-stable/index.md
@ -0,0 +1,153 @@
+---
+title: v1.67.0-stable - SCIM Integration
+slug: v1.67.0-stable
+date: 2025-04-19T10:00:00
+authors:
+  - name: Krrish Dholakia
+    title: CEO, LiteLLM
+    url: https://www.linkedin.com/in/krish-d/
+    image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8
+  - name: Ishaan Jaffer
+    title: CTO, LiteLLM
+    url: https://www.linkedin.com/in/reffajnaahsi/
+    image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg
+
+tags: ["sso", "unified_file_id", "cost_tracking", "security"]
+hide_table_of_contents: false
+---
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+## Key Highlights
+
+- **SCIM Integration**: Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning
+- **Team and Tag based usage tracking**: You can now see usage and spend by team and tag at 1M+ spend logs.
+- **Unified Responses API**: Support for calling Anthropic, Gemini, Groq, etc. via OpenAI's new Responses API.
+
+Let's dive in.
+
+## SCIM Integration
+
+<Image img={require('../../img/scim_integration.png')}/>
+
+This release adds SCIM support to LiteLLM. This allows your SSO provider (Okta, Azure AD, etc) to automatically create/delete users, teams, and memberships on LiteLLM. This means that when you remove a team on your SSO provider, your SSO provider will automatically delete the corresponding team on LiteLLM. 
+
+[Read more](../../docs/tutorials/scim_litellm)
+
+## Team and Tag based usage tracking
+
+<Image img={require('../../img/release_notes/new_team_usage_highlight.jpg')}/>
+
+
+This release improves team and tag based usage tracking at 1M+ spend logs, making it easy to monitor your LLM API spend in production. This covers:
+
+- View **daily spend** by teams + tags
+- View **usage / spend by key**, within teams
+- View **spend by multiple tags**
+- Allow **internal users** to view spend of teams they're a member of
+
+[Read more](#management-endpoints--ui)
+
+## Unified Responses API
+
+This release allows you to call Azure OpenAI, Anthropic, AWS Bedrock, and Google Vertex AI models via the POST /v1/responses endpoint on LiteLLM. This means you can now use popular tools like [OpenAI Codex](https://docs.litellm.ai/docs/tutorials/openai_codex) with your own models. 
+
+<Image img={require('../../img/release_notes/unified_responses_api_rn.png')}/>
+
+
+[Read more](https://docs.litellm.ai/docs/response_api)
+
+
+## New Models / Updated Models
+
+- **OpenAI**
+    1. gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing - [Get Started](../../docs/providers/openai#usage), [PR](https://github.com/BerriAI/litellm/pull/9990)
+    2. o4 - correctly map o4 to openai o_series model
+- **Azure AI**
+    1. Phi-4 output cost per token fix - [PR](https://github.com/BerriAI/litellm/pull/9880)
+    2. Responses API support - [Get Started](../../docs/providers/azure#azure-responses-api), [PR](https://github.com/BerriAI/litellm/pull/10116)
+- **Anthropic**
+    1. redacted message thinking support - [Get Started](../../docs/providers/anthropic#usage---thinking--reasoning_content), [PR](https://github.com/BerriAI/litellm/pull/10129)
+- **Cohere**
+    1. `/v2/chat` Passthrough endpoint support w/ cost tracking - [Get Started](../../docs/pass_through/cohere), [PR](https://github.com/BerriAI/litellm/pull/9997)
+- **Azure**
+    1. Support azure tenant_id/client_id env vars - [Get Started](../../docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret), [PR](https://github.com/BerriAI/litellm/pull/9993)
+    2. Fix response_format check for 2025+ api versions - [PR](https://github.com/BerriAI/litellm/pull/9993)
+    3. Add gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing
+- **VLLM**
+    1. Files - Support 'file' message type for VLLM video URLs - [Get Started](../../docs/providers/vllm#send-video-url-to-vllm), [PR](https://github.com/BerriAI/litellm/pull/10129)
+    2. Passthrough - new `/vllm/` passthrough endpoint support [Get Started](../../docs/pass_through/vllm), [PR](https://github.com/BerriAI/litellm/pull/10002)
+- **Mistral**
+    1. new `/mistral` passthrough endpoint support [Get Started](../../docs/pass_through/mistral), [PR](https://github.com/BerriAI/litellm/pull/10002)
+- **AWS**
+    1. New mapped bedrock regions - [PR](https://github.com/BerriAI/litellm/pull/9430)
+- **VertexAI / Google AI Studio**
+    1. Gemini - Response format - Retain schema field ordering for google gemini and vertex by specifying propertyOrdering - [Get Started](../../docs/providers/vertex#json-schema), [PR](https://github.com/BerriAI/litellm/pull/9828)
+    2. Gemini-2.5-flash - return reasoning content [Google AI Studio](../../docs/providers/gemini#usage---thinking--reasoning_content), [Vertex AI](../../docs/providers/vertex#thinking--reasoning_content)
+    3. Gemini-2.5-flash - pricing + model information [PR](https://github.com/BerriAI/litellm/pull/10125)
+    4. Passthrough - new `/vertex_ai/discovery` route - enables calling AgentBuilder API routes [Get Started](../../docs/pass_through/vertex_ai#supported-api-endpoints), [PR](https://github.com/BerriAI/litellm/pull/10084)
+- **Fireworks AI**
+    1. return tool calling responses in `tool_calls` field (fireworks incorrectly returns this as a json str in content) [PR](https://github.com/BerriAI/litellm/pull/10130)
+- **Triton**
+    1. Remove fixed bad_words / stop words from `/generate` call - [Get Started](../../docs/providers/triton-inference-server#triton-generate---chat-completion), [PR](https://github.com/BerriAI/litellm/pull/10163)
+- **Other**
+    1. Support for all litellm providers on Responses API (works with Codex) - [Get Started](../../docs/tutorials/openai_codex), [PR](https://github.com/BerriAI/litellm/pull/10132)
+    2. Fix combining multiple tool calls in streaming response - [Get Started](../../docs/completion/stream#helper-function), [PR](https://github.com/BerriAI/litellm/pull/10040)
+
+
+## Spend Tracking Improvements
+
+- **Cost Control** - inject cache control points in prompt for cost reduction [Get Started](../../docs/tutorials/prompt_caching), [PR](https://github.com/BerriAI/litellm/pull/10000)
+- **Spend Tags** - spend tags in headers - support x-litellm-tags even if tag based routing not enabled [Get Started](../../docs/proxy/request_headers#litellm-headers), [PR](https://github.com/BerriAI/litellm/pull/10000)
+- **Gemini-2.5-flash** - support cost calculation for reasoning tokens [PR](https://github.com/BerriAI/litellm/pull/10141)
+
+## Management Endpoints / UI
+- **Users**
+    1. Show created_at and updated_at on users page - [PR](https://github.com/BerriAI/litellm/pull/10033)
+- **Virtual Keys**
+    1. Filter by key alias - [PR](https://github.com/BerriAI/litellm/pull/10085)
+- **Usage Tab**
+
+    1. Team based usage
+        
+        - New `LiteLLM_DailyTeamSpend` Table for aggregate team based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10039)
+        
+        - New Team based usage dashboard + new `/team/daily/activity` API - [PR](https://github.com/BerriAI/litellm/pull/10081)
+        - Return team alias on /team/daily/activity API - [PR](https://github.com/BerriAI/litellm/pull/10157)
+        - allow internal user view spend for teams they belong to - [PR](https://github.com/BerriAI/litellm/pull/10157)
+        - allow viewing top keys by team - [PR](https://github.com/BerriAI/litellm/pull/10157)
+
+        <Image img={require('../../img/release_notes/new_team_usage.png')}/>
+
+    2. Tag Based Usage
+        - New `LiteLLM_DailyTagSpend` Table for aggregate tag based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10071)
+        - Restrict to only Proxy Admins - [PR](https://github.com/BerriAI/litellm/pull/10157)
+        - allow viewing top keys by tag
+        - Return tags passed in request (i.e. dynamic tags) on `/tag/list` API - [PR](https://github.com/BerriAI/litellm/pull/10157)
+        <Image img={require('../../img/release_notes/new_tag_usage.png')}/>
+    3. Track prompt caching metrics in daily user, team, tag tables - [PR](https://github.com/BerriAI/litellm/pull/10029)
+    4. Show usage by key (on all up, team, and tag usage dashboards) - [PR](https://github.com/BerriAI/litellm/pull/10157)
+    5. swap old usage with new usage tab
+- **Models**
+    1. Make columns resizable/hideable - [PR](https://github.com/BerriAI/litellm/pull/10119)
+- **API Playground**
+    1. Allow internal user to call api playground - [PR](https://github.com/BerriAI/litellm/pull/10157)
+- **SCIM**
+    1. Add LiteLLM SCIM Integration for Team and User management - [Get Started](../../docs/tutorials/scim_litellm), [PR](https://github.com/BerriAI/litellm/pull/10072)
+
+
+## Logging / Guardrail Integrations
+- **GCS**
+    1. Fix gcs pub sub logging with env var GCS_PROJECT_ID - [Get Started](../../docs/observability/gcs_bucket_integration#usage), [PR](https://github.com/BerriAI/litellm/pull/10042)
+- **AIM**
+    1. Add litellm call id passing to Aim guardrails on pre and post-hooks calls - [Get Started](../../docs/proxy/guardrails/aim_security), [PR](https://github.com/BerriAI/litellm/pull/10021)
+- **Azure blob storage**
+    1. Ensure logging works in high throughput scenarios - [Get Started](../../docs/proxy/logging#azure-blob-storage), [PR](https://github.com/BerriAI/litellm/pull/9962)
+
+## General Proxy Improvements
+
+- **Support setting `litellm.modify_params` via env var** [PR](https://github.com/BerriAI/litellm/pull/9964)
+- **Model Discovery** - Check provider’s `/models` endpoints when calling proxy’s `/v1/models` endpoint - [Get Started](../../docs/proxy/model_discovery), [PR](https://github.com/BerriAI/litellm/pull/9958)
+- **`/utils/token_counter`** - fix retrieving custom tokenizer for db models - [Get Started](../../docs/proxy/configs#set-custom-tokenizer), [PR](https://github.com/BerriAI/litellm/pull/10047)
+- **Prisma migrate** - handle existing columns in db table - [PR](https://github.com/BerriAI/litellm/pull/10138)
+
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -69,6 +69,7 @@ const sidebars = {
            "proxy/clientside_auth",
            "proxy/request_headers",
            "proxy/response_headers",
+            "proxy/model_discovery",
          ],
        },
        {
@ -101,6 +102,7 @@ const sidebars = {
            "proxy/admin_ui_sso",
            "proxy/self_serve",
            "proxy/public_teams",
+            "tutorials/scim_litellm",
            "proxy/custom_sso",
            "proxy/ui_credentials",
            "proxy/ui_logs"
@ -188,7 +190,7 @@ const sidebars = {
        "providers/azure_ai",
        "providers/aiml",
        "providers/vertex",
-        
+
        {
          type: "category",
          label: "Google AI Studio",
@ -330,6 +332,8 @@ const sidebars = {
            "pass_through/vertex_ai",
            "pass_through/google_ai_studio",
            "pass_through/cohere",
+            "pass_through/vllm",
+            "pass_through/mistral",
            "pass_through/openai_passthrough",
            "pass_through/anthropic_completion",
            "pass_through/bedrock",
@ -340,7 +344,15 @@ const sidebars = {
        },
        "rerank",
        "assistants",
-        "files_endpoints",
+
+        {
+          type: "category",
+          label: "/files",
+          items: [
+            "files_endpoints",
+            "proxy/litellm_managed_files",
+          ],
+        },
        "batches",
        "realtime",
        "fine_tuning",
@ -399,9 +411,9 @@ const sidebars = {
      type: "category",
      label: "Logging & Observability",
      items: [
+        "observability/langfuse_integration",
        "observability/lunary_integration",
        "observability/mlflow",
-        "observability/langfuse_integration",
        "observability/gcs_bucket_integration",
        "observability/langsmith_integration",
        "observability/literalai_integration",
@ -435,7 +447,10 @@ const sidebars = {
      label: "Tutorials",
      items: [
        "tutorials/openweb_ui",
+        "tutorials/openai_codex",
        "tutorials/msft_sso",
+        "tutorials/prompt_caching",
+        "tutorials/tag_management",
        'tutorials/litellm_proxy_aporia',
        {
          type: "category",
--- a/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.4-py3-none-any.whl
+++ b/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.4-py3-none-any.whl
--- a/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.4.tar.gz
+++ b/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.4.tar.gz
--- a/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.7-py3-none-any.whl
+++ b/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.7-py3-none-any.whl
--- a/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.7.tar.gz
+++ b/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.7.tar.gz
--- a/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.8-py3-none-any.whl
+++ b/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.8-py3-none-any.whl
--- a/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.8.tar.gz
+++ b/litellm-proxy-extras/dist/litellm_proxy_extras-0.1.8.tar.gz
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250411215431_add_managed_file_table/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250411215431_add_managed_file_table/migration.sql
@ -0,0 +1,18 @@
+-- CreateTable
+CREATE TABLE "LiteLLM_ManagedFileTable" (
+    "id" TEXT NOT NULL,
+    "unified_file_id" TEXT NOT NULL,
+    "file_object" JSONB NOT NULL,
+    "model_mappings" JSONB NOT NULL,
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL,
+
+    CONSTRAINT "LiteLLM_ManagedFileTable_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateIndex
+CREATE UNIQUE INDEX "LiteLLM_ManagedFileTable_unified_file_id_key" ON "LiteLLM_ManagedFileTable"("unified_file_id");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_ManagedFileTable_unified_file_id_idx" ON "LiteLLM_ManagedFileTable"("unified_file_id");
+
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250412081753_team_member_permissions/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250412081753_team_member_permissions/migration.sql
@ -0,0 +1,3 @@
+-- AlterTable
+ALTER TABLE "LiteLLM_TeamTable" ADD COLUMN     "team_member_permissions" TEXT[] DEFAULT ARRAY[]::TEXT[];
+
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250415151647_add_cache_read_write_tokens_daily_spend_transactions/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250415151647_add_cache_read_write_tokens_daily_spend_transactions/migration.sql
@ -0,0 +1,4 @@
+-- AlterTable
+ALTER TABLE "LiteLLM_DailyUserSpend" ADD COLUMN     "cache_creation_input_tokens" INTEGER NOT NULL DEFAULT 0,
+ADD COLUMN     "cache_read_input_tokens" INTEGER NOT NULL DEFAULT 0;
+
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250415191926_add_daily_team_table/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250415191926_add_daily_team_table/migration.sql
@ -0,0 +1,36 @@
+-- CreateTable
+CREATE TABLE "LiteLLM_DailyTeamSpend" (
+    "id" TEXT NOT NULL,
+    "team_id" TEXT NOT NULL,
+    "date" TEXT NOT NULL,
+    "api_key" TEXT NOT NULL,
+    "model" TEXT NOT NULL,
+    "model_group" TEXT,
+    "custom_llm_provider" TEXT,
+    "prompt_tokens" INTEGER NOT NULL DEFAULT 0,
+    "completion_tokens" INTEGER NOT NULL DEFAULT 0,
+    "spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
+    "api_requests" INTEGER NOT NULL DEFAULT 0,
+    "successful_requests" INTEGER NOT NULL DEFAULT 0,
+    "failed_requests" INTEGER NOT NULL DEFAULT 0,
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL,
+
+    CONSTRAINT "LiteLLM_DailyTeamSpend_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTeamSpend_date_idx" ON "LiteLLM_DailyTeamSpend"("date");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTeamSpend_team_id_idx" ON "LiteLLM_DailyTeamSpend"("team_id");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTeamSpend_api_key_idx" ON "LiteLLM_DailyTeamSpend"("api_key");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTeamSpend_model_idx" ON "LiteLLM_DailyTeamSpend"("model");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "LiteLLM_DailyTeamSpend_team_id_date_api_key_model_custom_ll_key" ON "LiteLLM_DailyTeamSpend"("team_id", "date", "api_key", "model", "custom_llm_provider");
+
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250416115320_add_tag_table_to_db/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250416115320_add_tag_table_to_db/migration.sql
@ -0,0 +1,45 @@
+-- AlterTable
+ALTER TABLE "LiteLLM_DailyTeamSpend" ADD COLUMN     "cache_creation_input_tokens" INTEGER NOT NULL DEFAULT 0,
+ADD COLUMN     "cache_read_input_tokens" INTEGER NOT NULL DEFAULT 0;
+
+-- CreateTable
+CREATE TABLE "LiteLLM_DailyTagSpend" (
+    "id" TEXT NOT NULL,
+    "tag" TEXT NOT NULL,
+    "date" TEXT NOT NULL,
+    "api_key" TEXT NOT NULL,
+    "model" TEXT NOT NULL,
+    "model_group" TEXT,
+    "custom_llm_provider" TEXT,
+    "prompt_tokens" INTEGER NOT NULL DEFAULT 0,
+    "completion_tokens" INTEGER NOT NULL DEFAULT 0,
+    "cache_read_input_tokens" INTEGER NOT NULL DEFAULT 0,
+    "cache_creation_input_tokens" INTEGER NOT NULL DEFAULT 0,
+    "spend" DOUBLE PRECISION NOT NULL DEFAULT 0.0,
+    "api_requests" INTEGER NOT NULL DEFAULT 0,
+    "successful_requests" INTEGER NOT NULL DEFAULT 0,
+    "failed_requests" INTEGER NOT NULL DEFAULT 0,
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL,
+
+    CONSTRAINT "LiteLLM_DailyTagSpend_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateIndex
+CREATE UNIQUE INDEX "LiteLLM_DailyTagSpend_tag_key" ON "LiteLLM_DailyTagSpend"("tag");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTagSpend_date_idx" ON "LiteLLM_DailyTagSpend"("date");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTagSpend_tag_idx" ON "LiteLLM_DailyTagSpend"("tag");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTagSpend_api_key_idx" ON "LiteLLM_DailyTagSpend"("api_key");
+
+-- CreateIndex
+CREATE INDEX "LiteLLM_DailyTagSpend_model_idx" ON "LiteLLM_DailyTagSpend"("model");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "LiteLLM_DailyTagSpend_tag_date_api_key_model_custom_llm_pro_key" ON "LiteLLM_DailyTagSpend"("tag", "date", "api_key", "model", "custom_llm_provider");
+
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250416151339_drop_tag_uniqueness_requirement/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250416151339_drop_tag_uniqueness_requirement/migration.sql
@ -0,0 +1,3 @@
+-- DropIndex
+DROP INDEX "LiteLLM_DailyTagSpend_tag_key";
+
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20250416185146_add_allowed_routes_litellm_verification_token/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20250416185146_add_allowed_routes_litellm_verification_token/migration.sql
@ -0,0 +1,3 @@
+-- AlterTable
+ALTER TABLE "LiteLLM_VerificationToken" ADD COLUMN     "allowed_routes" TEXT[] DEFAULT ARRAY[]::TEXT[];
+
--- a/litellm-proxy-extras/litellm_proxy_extras/schema.prisma
+++ b/litellm-proxy-extras/litellm_proxy_extras/schema.prisma
@ -106,6 +106,7 @@ model LiteLLM_TeamTable {
    updated_at    DateTime               @default(now()) @updatedAt @map("updated_at")
    model_spend      Json @default("{}")
    model_max_budget Json @default("{}")
+    team_member_permissions String[] @default([])
    model_id Int? @unique // id for LiteLLM_ModelTable -> stores team-level model aliases
    litellm_organization_table LiteLLM_OrganizationTable?   @relation(fields: [organization_id], references: [organization_id])
    litellm_model_table LiteLLM_ModelTable? @relation(fields: [model_id], references: [id])
@ -168,6 +169,7 @@ model LiteLLM_VerificationToken {
    budget_duration String? 
    budget_reset_at DateTime?
    allowed_cache_controls String[] @default([])
+    allowed_routes   String[] @default([])
    model_spend      Json @default("{}")
    model_max_budget Json @default("{}")
    budget_id String?
@ -325,6 +327,8 @@ model LiteLLM_DailyUserSpend {
  custom_llm_provider String?  
  prompt_tokens       Int      @default(0)
  completion_tokens   Int      @default(0)
+  cache_read_input_tokens     Int      @default(0)
+  cache_creation_input_tokens Int      @default(0)
  spend               Float    @default(0.0)
  api_requests        Int      @default(0)
  successful_requests Int      @default(0)
@ -339,6 +343,60 @@ model LiteLLM_DailyUserSpend {
  @@index([model])
 }

+// Track daily team spend metrics per model and key
+model LiteLLM_DailyTeamSpend {
+  id                  String   @id @default(uuid())
+  team_id             String
+  date                String
+  api_key             String   
+  model               String   
+  model_group         String?  
+  custom_llm_provider String?  
+  prompt_tokens       Int      @default(0)
+  completion_tokens   Int      @default(0)
+  cache_read_input_tokens     Int      @default(0)
+  cache_creation_input_tokens Int      @default(0)
+  spend               Float    @default(0.0)
+  api_requests        Int      @default(0)
+  successful_requests Int      @default(0)
+  failed_requests     Int      @default(0)
+  created_at          DateTime @default(now())
+  updated_at          DateTime @updatedAt
+
+  @@unique([team_id, date, api_key, model, custom_llm_provider])
+  @@index([date])
+  @@index([team_id])
+  @@index([api_key])
+  @@index([model])
+}
+
+// Track daily tag spend metrics per model and key
+model LiteLLM_DailyTagSpend {
+  id                  String   @id @default(uuid())
+  tag                 String   
+  date                String
+  api_key             String   
+  model               String   
+  model_group         String?  
+  custom_llm_provider String?  
+  prompt_tokens       Int      @default(0)
+  completion_tokens   Int      @default(0)
+  cache_read_input_tokens     Int      @default(0)
+  cache_creation_input_tokens Int      @default(0)
+  spend               Float    @default(0.0)
+  api_requests        Int      @default(0)
+  successful_requests Int      @default(0)
+  failed_requests     Int      @default(0)
+  created_at          DateTime @default(now())
+  updated_at          DateTime @updatedAt
+
+  @@unique([tag, date, api_key, model, custom_llm_provider])
+  @@index([date])
+  @@index([tag])
+  @@index([api_key])
+  @@index([model])
+}
+

 // Track the status of cron jobs running. Only allow one pod to run the job at a time
 model LiteLLM_CronJob {
@ -354,3 +412,14 @@ enum JobStatus {
  INACTIVE
 }

+// One record per unified file ID, holding the file object and its per-model provider file IDs
+model LiteLLM_ManagedFileTable {
+  id String @id @default(uuid())
+  unified_file_id String @unique // The base64 encoded unified file ID
+  file_object Json // Stores the OpenAIFileObject
+  model_mappings Json // Stores the mapping of model_id -> provider_file_id
+  created_at DateTime @default(now())
+  updated_at DateTime @updatedAt
+
+  // NOTE(review): unified_file_id is already @unique, which normally creates an index; this explicit index looks redundant - confirm before removing.
+  @@index([unified_file_id])
+}
+
--- a/litellm-proxy-extras/litellm_proxy_extras/utils.py
+++ b/litellm-proxy-extras/litellm_proxy_extras/utils.py
@ -1,7 +1,10 @@
+import glob
 import os
 import random
+import re
 import subprocess
 import time
+from pathlib import Path
 from typing import Optional

 from litellm_proxy_extras._logging import logger
@ -14,6 +17,114 @@ def str_to_bool(value: Optional[str]) -> bool:


 class ProxyExtrasDBManager:
+    @staticmethod
+    def _get_prisma_dir() -> str:
+        """
+        Return the directory that contains this module.
+
+        Callers look for the `migrations/` folder underneath this directory.
+        """
+        return os.path.dirname(__file__)
+
+    @staticmethod
+    def _create_baseline_migration(schema_path: str) -> bool:
+        """
+        Create a baseline (`0_init`) migration for an existing database.
+
+        Writes the SQL produced by `prisma migrate diff --from-empty` to
+        `migrations/0_init/migration.sql` (below the directory returned by
+        `_get_prisma_dir`) and then marks `0_init` as applied with
+        `prisma migrate resolve`.
+
+        Args:
+            schema_path: Path to the Prisma schema the baseline is generated from.
+
+        Returns:
+            True if both prisma commands succeeded, False if either one timed
+            out or exited with a non-zero status.
+        """
+        prisma_dir = ProxyExtrasDBManager._get_prisma_dir()
+        prisma_dir_path = Path(prisma_dir)
+        init_dir = prisma_dir_path / "migrations" / "0_init"
+
+        # Create migrations/0_init directory
+        init_dir.mkdir(parents=True, exist_ok=True)
+
+        # Generate migration SQL file
+        migration_file = init_dir / "migration.sql"
+
+        try:
+            # Open the output file in a `with` block so the handle is always
+            # closed before `migrate resolve` runs, even when prisma fails.
+            with open(migration_file, "w") as migration_sql:
+                subprocess.run(
+                    [
+                        "prisma",
+                        "migrate",
+                        "diff",
+                        "--from-empty",
+                        "--to-schema-datamodel",
+                        str(schema_path),
+                        "--script",
+                    ],
+                    stdout=migration_sql,
+                    check=True,
+                    timeout=30,
+                )  # 30 second timeout
+
+            # Mark the baseline migration as applied
+            subprocess.run(
+                [
+                    "prisma",
+                    "migrate",
+                    "resolve",
+                    "--applied",
+                    "0_init",
+                ],
+                check=True,
+                timeout=30,
+            )
+
+            return True
+        except subprocess.TimeoutExpired:
+            logger.warning(
+                "Migration timed out - the database might be under heavy load."
+            )
+            return False
+        except subprocess.CalledProcessError as e:
+            logger.warning(f"Error creating baseline migration: {e}")
+            return False
+
+    @staticmethod
+    def _get_migration_names(migrations_dir: str) -> list:
+        """
+        Return the names of all migration directories under `migrations_dir`.
+
+        A directory counts as a migration when it is a direct child of
+        `<migrations_dir>/migrations` and contains a `migration.sql` file.
+        The names are returned sorted, because `glob.glob` yields paths in
+        arbitrary order and callers process the migrations one by one.
+        """
+        migration_paths = glob.glob(f"{migrations_dir}/migrations/*/migration.sql")
+        logger.info(f"Found {len(migration_paths)} migrations at {migrations_dir}")
+        return sorted(Path(p).parent.name for p in migration_paths)
+
+    @staticmethod
+    def _roll_back_migration(migration_name: str):
+        """
+        Mark a specific migration as rolled back.
+
+        Runs `prisma migrate resolve --rolled-back <migration_name>`.
+
+        Raises:
+            subprocess.CalledProcessError: prisma exited with a non-zero status.
+            subprocess.TimeoutExpired: prisma did not finish within 60 seconds.
+        """
+        subprocess.run(
+            ["prisma", "migrate", "resolve", "--rolled-back", migration_name],
+            timeout=60,
+            check=True,
+            capture_output=True,
+            # Decode output like the other prisma calls in this class, so that
+            # `stderr` on a raised CalledProcessError is a str, not bytes.
+            text=True,
+        )
+
+    @staticmethod
+    def _resolve_specific_migration(migration_name: str):
+        """
+        Mark a specific migration as applied.
+
+        Runs `prisma migrate resolve --applied <migration_name>`.
+
+        Raises:
+            subprocess.CalledProcessError: prisma exited with a non-zero status.
+            subprocess.TimeoutExpired: prisma did not finish within 60 seconds.
+        """
+        subprocess.run(
+            ["prisma", "migrate", "resolve", "--applied", migration_name],
+            timeout=60,
+            check=True,
+            capture_output=True,
+            # Decode output like the other prisma calls in this class, so that
+            # `stderr` on a raised CalledProcessError is a str, not bytes.
+            text=True,
+        )
+
+    @staticmethod
+    def _resolve_all_migrations(migrations_dir: str):
+        """
+        Mark every migration found under `migrations_dir` as applied.
+
+        Runs `prisma migrate resolve --applied <name>` once per migration. A
+        failing command is logged as a warning (unless prisma reports that the
+        migration is already recorded as applied) and the loop moves on to the
+        next migration.
+        """
+        names = ProxyExtrasDBManager._get_migration_names(migrations_dir)
+        logger.info(f"Resolving {len(names)} migrations")
+        already_applied_msg = "is already recorded as applied in the database."
+        for name in names:
+            logger.info(f"Resolving migration: {name}")
+            try:
+                subprocess.run(
+                    ["prisma", "migrate", "resolve", "--applied", name],
+                    timeout=60,
+                    check=True,
+                    capture_output=True,
+                    text=True,
+                )
+            except subprocess.CalledProcessError as err:
+                # An already-applied migration is the expected case; stay quiet.
+                if already_applied_msg in err.stderr:
+                    continue
+                logger.warning(f"Failed to resolve migration {name}: {err.stderr}")
+            else:
+                logger.debug(f"Resolved migration: {name}")
+
    @staticmethod
    def setup_database(schema_path: str, use_migrate: bool = False) -> bool:
        """
@ -30,7 +141,7 @@ class ProxyExtrasDBManager:
        use_migrate = str_to_bool(os.getenv("USE_PRISMA_MIGRATE")) or use_migrate
        for attempt in range(4):
            original_dir = os.getcwd()
-            migrations_dir = os.path.dirname(__file__)
+            migrations_dir = ProxyExtrasDBManager._get_prisma_dir()
            os.chdir(migrations_dir)

            try:
@ -51,12 +162,70 @@ class ProxyExtrasDBManager:
                        return True
                    except subprocess.CalledProcessError as e:
                        logger.info(f"prisma db error: {e.stderr}, e: {e.stdout}")
-                        if (
+                        if "P3009" in e.stderr:
+                            # Extract the failed migration name from the error message
+                            migration_match = re.search(
+                                r"`(\d+_.*)` migration", e.stderr
+                            )
+                            if migration_match:
+                                failed_migration = migration_match.group(1)
+                                logger.info(
+                                    f"Found failed migration: {failed_migration}, marking as rolled back"
+                                )
+                                # Mark the failed migration as rolled back
+                                subprocess.run(
+                                    [
+                                        "prisma",
+                                        "migrate",
+                                        "resolve",
+                                        "--rolled-back",
+                                        failed_migration,
+                                    ],
+                                    timeout=60,
+                                    check=True,
+                                    capture_output=True,
+                                    text=True,
+                                )
+                                logger.info(
+                                    f"✅ Migration {failed_migration} marked as rolled back... retrying"
+                                )
+                        elif (
                            "P3005" in e.stderr
                            and "database schema is not empty" in e.stderr
                        ):
-                            logger.info("Error: Database schema is not empty")
-                            return False
+                            logger.info(
+                                "Database schema is not empty, creating baseline migration"
+                            )
+                            ProxyExtrasDBManager._create_baseline_migration(schema_path)
+                            logger.info(
+                                "Baseline migration created, resolving all migrations"
+                            )
+                            ProxyExtrasDBManager._resolve_all_migrations(migrations_dir)
+                            logger.info("✅ All migrations resolved.")
+                            return True
+                        elif (
+                            "P3018" in e.stderr
+                        ):  # PostgreSQL error code for duplicate column
+                            logger.info(
+                                "Migration already exists, resolving specific migration"
+                            )
+                            # Extract the migration name from the error message
+                            migration_match = re.search(
+                                r"Migration name: (\d+_.*)", e.stderr
+                            )
+                            if migration_match:
+                                migration_name = migration_match.group(1)
+                                logger.info(f"Rolling back migration {migration_name}")
+                                ProxyExtrasDBManager._roll_back_migration(
+                                    migration_name
+                                )
+                                logger.info(
+                                    f"Resolving migration {migration_name} that failed due to existing columns"
+                                )
+                                ProxyExtrasDBManager._resolve_specific_migration(
+                                    migration_name
+                                )
+                                logger.info("✅ Migration resolved.")
                else:
                    # Use prisma db push with increased timeout
                    subprocess.run(
--- a/litellm-proxy-extras/poetry.lock
+++ b/litellm-proxy-extras/poetry.lock
@ -1,7 +1,7 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
 package = []

 [metadata]
-lock-version = "2.0"
+lock-version = "2.1"
 python-versions = ">=3.8.1,<4.0, !=3.9.7"
 content-hash = "2cf39473e67ff0615f0a61c9d2ac9f02b38cc08cbb1bdb893d89bee002646623"
--- a/litellm-proxy-extras/pyproject.toml
+++ b/litellm-proxy-extras/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm-proxy-extras"
-version = "0.1.3"
+version = "0.1.11"
 description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
 authors = ["BerriAI"]
 readme = "README.md"
@ -22,7 +22,7 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "0.1.3"
+version = "0.1.11"
 version_files = [
    "pyproject.toml:version",
    "../requirements.txt:litellm-proxy-extras==",
--- a/litellm/init.py
+++ b/litellm/init.py
@ -113,6 +113,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
    "pagerduty",
    "humanloop",
    "gcs_pubsub",
+    "anthropic_cache_control_hook",
 ]
 logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
 _known_custom_logger_compatible_callbacks: List = list(
@ -162,7 +163,7 @@ token: Optional[str] = (
 telemetry = True
 max_tokens: int = DEFAULT_MAX_TOKENS  # OpenAI Defaults
 drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
-modify_params = False
+modify_params = bool(os.getenv("LITELLM_MODIFY_PARAMS", False))
 retry = True
 ### AUTH ###
 api_key: Optional[str] = None
@ -324,6 +325,7 @@ from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map

 model_cost = get_model_cost_map(url=model_cost_map_url)
 custom_prompt_dict: Dict[str, dict] = {}
+check_provider_endpoint = False


 ####### THREAD-SPECIFIC DATA ####################
@ -947,6 +949,7 @@ from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
 from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
 from .llms.mistral.mistral_chat_transformation import MistralConfig
 from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
+from .llms.azure.responses.transformation import AzureOpenAIResponsesAPIConfig
 from .llms.openai.chat.o_series_transformation import (
    OpenAIOSeriesConfig as OpenAIO1Config,  # maintain backwards compatibility
    OpenAIOSeriesConfig,
--- a/litellm/assistants/main.py
+++ b/litellm/assistants/main.py
@ -304,6 +304,11 @@ def create_assistants(
        "response_format": response_format,
    }

+    # only send params that are not None
+    create_assistant_data = {
+        k: v for k, v in create_assistant_data.items() if v is not None
+    }
+
    response: Optional[Union[Coroutine[Any, Any, Assistant], Assistant]] = None
    if custom_llm_provider == "openai":
        api_base = (
--- a/litellm/constants.py
+++ b/litellm/constants.py
@ -21,9 +21,18 @@ DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.

+DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
+DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
+DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
+
+########## Networking constants ##############################################################
+_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600  # 1 hour, re-use the same httpx client for 1 hour
+
 ########### v2 Architecture constants for managing writing updates to the database ###########
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
 REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
+REDIS_DAILY_TEAM_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_team_spend_update_buffer"
+REDIS_DAILY_TAG_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_tag_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
 MAX_SIZE_IN_MEMORY_QUEUE = 10000
 MAX_IN_MEMORY_QUEUE_FLUSH_COUNT = 1000
--- a/litellm/integrations/anthropic_cache_control_hook.py
+++ b/litellm/integrations/anthropic_cache_control_hook.py
@ -0,0 +1,150 @@
+"""
+This hook is used to inject cache control directives into the messages of a chat completion.
+
+Users can define
+- `cache_control_injection_points` in the completion params and litellm will inject the cache control directives into the messages at the specified injection points.
+
+"""
+
+import copy
+from typing import Dict, List, Optional, Tuple, Union, cast
+
+from litellm.integrations.custom_logger import CustomLogger
+from litellm.integrations.custom_prompt_management import CustomPromptManagement
+from litellm.types.integrations.anthropic_cache_control_hook import (
+    CacheControlInjectionPoint,
+    CacheControlMessageInjectionPoint,
+)
+from litellm.types.llms.openai import AllMessageValues, ChatCompletionCachedContent
+from litellm.types.utils import StandardCallbackDynamicParams
+
+
+class AnthropicCacheControlHook(CustomPromptManagement):
+    """
+    Prompt-management hook that adds `cache_control` directives to chat messages.
+
+    The injection points are read from the `cache_control_injection_points`
+    entry of the completion params.
+    """
+
+    def get_chat_completion_prompt(
+        self,
+        model: str,
+        messages: List[AllMessageValues],
+        non_default_params: dict,
+        prompt_id: Optional[str],
+        prompt_variables: Optional[dict],
+        dynamic_callback_params: StandardCallbackDynamicParams,
+    ) -> Tuple[str, List[AllMessageValues], dict]:
+        """
+        Apply cache control directives based on specified injection points.
+
+        `cache_control_injection_points` is popped from `non_default_params`
+        (the dict is modified in place). Only points whose `location` is
+        "message" are processed; any other point is ignored.
+
+        Returns:
+        - model: str - the model to use (returned unchanged)
+        - messages: List[AllMessageValues] - a deep copy of `messages` with cache
+          controls applied, or the original list when no injection points were given
+        - non_default_params: dict - the params without `cache_control_injection_points`
+        """
+        # Extract cache control injection points
+        injection_points: List[CacheControlInjectionPoint] = non_default_params.pop(
+            "cache_control_injection_points", []
+        )
+        if not injection_points:
+            return model, messages, non_default_params
+
+        # Create a deep copy of messages to avoid modifying the original list
+        processed_messages = copy.deepcopy(messages)
+
+        # Process message-level cache controls
+        for point in injection_points:
+            if point.get("location") == "message":
+                point = cast(CacheControlMessageInjectionPoint, point)
+                processed_messages = self._process_message_injection(
+                    point=point, messages=processed_messages
+                )
+
+        return model, processed_messages, non_default_params
+
+    @staticmethod
+    def _process_message_injection(
+        point: CacheControlMessageInjectionPoint, messages: List[AllMessageValues]
+    ) -> List[AllMessageValues]:
+        """
+        Apply one message-level injection point to `messages` (modified in place).
+
+        The target is chosen by `index` when it resolves to an integer, otherwise
+        by `role`. An out-of-range index, or a point with neither a usable index
+        nor a role, leaves the messages untouched. When the point carries no
+        `control`, an "ephemeral" cache control is used.
+        """
+        control: ChatCompletionCachedContent = point.get(
+            "control", None
+        ) or ChatCompletionCachedContent(type="ephemeral")
+
+        _targetted_index: Optional[Union[int, str]] = point.get("index", None)
+        targetted_index: Optional[int] = None
+        if isinstance(_targetted_index, str):
+            # `isdecimal` (not `isdigit`): `isdigit` also accepts characters such
+            # as superscript digits, which `int()` rejects with a ValueError.
+            # Any other string (including "-1") leaves the index unset.
+            if _targetted_index.isdecimal():
+                targetted_index = int(_targetted_index)
+        else:
+            targetted_index = _targetted_index
+
+        targetted_role = point.get("role", None)
+
+        # Case 1: Target by specific index
+        if targetted_index is not None:
+            if 0 <= targetted_index < len(messages):
+                messages[targetted_index] = (
+                    AnthropicCacheControlHook._safe_insert_cache_control_in_message(
+                        messages[targetted_index], control
+                    )
+                )
+        # Case 2: Target by role
+        elif targetted_role is not None:
+            for msg_index, msg in enumerate(messages):
+                if msg.get("role") == targetted_role:
+                    # Write the result back by index, so the update does not
+                    # depend on the helper mutating `msg` in place.
+                    messages[msg_index] = (
+                        AnthropicCacheControlHook._safe_insert_cache_control_in_message(
+                            message=msg, control=control
+                        )
+                    )
+        return messages
+
+    @staticmethod
+    def _safe_insert_cache_control_in_message(
+        message: AllMessageValues, control: ChatCompletionCachedContent
+    ) -> AllMessageValues:
+        """
+        Safe way to insert cache control in a message
+
+        OpenAI Message content can be either:
+            - string
+            - list of objects
+
+        This method handles inserting cache control in both cases:
+            - string content: `cache_control` is set on the message itself
+            - list content: `cache_control` is set on every dict item of the list
+
+        Any other content (e.g. None) leaves the message unchanged. The message
+        is modified in place and also returned.
+        """
+        message_content = message.get("content", None)
+
+        # 1. if string, insert cache control in the message
+        if isinstance(message_content, str):
+            message["cache_control"] = control  # type: ignore
+        # 2. list of objects
+        elif isinstance(message_content, list):
+            for content_item in message_content:
+                if isinstance(content_item, dict):
+                    content_item["cache_control"] = control  # type: ignore
+        return message
+
+    @property
+    def integration_name(self) -> str:
+        """Return the integration name for this hook."""
+        return "anthropic_cache_control_hook"
+
+    @staticmethod
+    def should_use_anthropic_cache_control_hook(non_default_params: Dict) -> bool:
+        """Return True when `non_default_params` has a truthy `cache_control_injection_points`."""
+        return bool(non_default_params.get("cache_control_injection_points", None))
+
+    @staticmethod
+    def get_custom_logger_for_anthropic_cache_control_hook(
+        non_default_params: Dict,
+    ) -> Optional[CustomLogger]:
+        """
+        Return the logger registered as "anthropic_cache_control_hook" when the
+        hook applies to `non_default_params`, otherwise None.
+        """
+        # Imported here rather than at module level.
+        # NOTE(review): presumably to avoid a circular import with litellm_logging - confirm.
+        from litellm.litellm_core_utils.litellm_logging import (
+            _init_custom_logger_compatible_class,
+        )
+
+        if AnthropicCacheControlHook.should_use_anthropic_cache_control_hook(
+            non_default_params
+        ):
+            return _init_custom_logger_compatible_class(
+                logging_integration="anthropic_cache_control_hook",
+                internal_usage_cache=None,
+                llm_router=None,
+            )
+        return None
--- a/litellm/integrations/azure_storage/azure_storage.py
+++ b/litellm/integrations/azure_storage/azure_storage.py
@ -1,14 +1,15 @@
 import asyncio
 import json
 import os
+import time
 import uuid
 from datetime import datetime, timedelta
 from typing import List, Optional

 from litellm._logging import verbose_logger
-from litellm.constants import AZURE_STORAGE_MSFT_VERSION
+from litellm.constants import _DEFAULT_TTL_FOR_HTTPX_CLIENTS, AZURE_STORAGE_MSFT_VERSION
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
-from litellm.llms.azure.common_utils import get_azure_ad_token_from_entrata_id
+from litellm.llms.azure.common_utils import get_azure_ad_token_from_entra_id
 from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    get_async_httpx_client,
@ -48,14 +49,17 @@ class AzureBlobStorageLogger(CustomBatchLogger):
                    "Missing required environment variable: AZURE_STORAGE_FILE_SYSTEM"
                )
            self.azure_storage_file_system: str = _azure_storage_file_system
+            self._service_client = None
+            # Time that the azure service client expires, in order to reset the connection pool and keep it fresh
+            self._service_client_timeout: Optional[float] = None

            # Internal variables used for Token based authentication
-            self.azure_auth_token: Optional[
-                str
-            ] = None  # the Azure AD token to use for Azure Storage API requests
-            self.token_expiry: Optional[
-                datetime
-            ] = None  # the expiry time of the currentAzure AD token
+            self.azure_auth_token: Optional[str] = (
+                None  # the Azure AD token to use for Azure Storage API requests
+            )
+            self.token_expiry: Optional[datetime] = (
+                None  # the expiry time of the currentAzure AD token
+            )

            asyncio.create_task(self.periodic_flush())
            self.flush_lock = asyncio.Lock()
@ -291,7 +295,7 @@ class AzureBlobStorageLogger(CustomBatchLogger):
                "Missing required environment variable: AZURE_STORAGE_CLIENT_SECRET"
            )

-        token_provider = get_azure_ad_token_from_entrata_id(
+        token_provider = get_azure_ad_token_from_entra_id(
            tenant_id=tenant_id,
            client_id=client_id,
            client_secret=client_secret,
@ -324,6 +328,25 @@ class AzureBlobStorageLogger(CustomBatchLogger):
                f"AzureBlobStorageLogger is only available for premium users. {CommonProxyErrors.not_premium_user}"
            )

+    async def get_service_client(self):
+        """
+        Return the cached async `DataLakeServiceClient`, creating it when needed.
+
+        A client is kept for `_DEFAULT_TTL_FOR_HTTPX_CLIENTS` seconds. Once that
+        deadline has passed, the old client is closed and a fresh one is created
+        with the storage account name and account key held on this logger.
+        """
+        from azure.storage.filedatalake.aio import DataLakeServiceClient
+
+        # expire old clients to recover from connection issues
+        if (
+            self._service_client_timeout
+            and self._service_client
+            # Expired only once the stored deadline lies in the past. The
+            # comparison used to be inverted, which closed a still-fresh client
+            # on every call and never replaced a stale one.
+            and time.time() > self._service_client_timeout
+        ):
+            await self._service_client.close()
+            self._service_client = None
+        if not self._service_client:
+            self._service_client = DataLakeServiceClient(
+                account_url=f"https://{self.azure_storage_account_name}.dfs.core.windows.net",
+                credential=self.azure_storage_account_key,
+            )
+            # Deadline after which the client is rebuilt on the next call
+            self._service_client_timeout = time.time() + _DEFAULT_TTL_FOR_HTTPX_CLIENTS
+        return self._service_client
+
    async def upload_to_azure_data_lake_with_azure_account_key(
        self, payload: StandardLoggingPayload
    ):
@ -332,13 +355,10 @@ class AzureBlobStorageLogger(CustomBatchLogger):

        This is used when Azure Storage Account Key is set - Azure Storage Account Key does not work directly with Azure Rest API
        """
-        from azure.storage.filedatalake.aio import DataLakeServiceClient

        # Create an async service client
-        service_client = DataLakeServiceClient(
-            account_url=f"https://{self.azure_storage_account_name}.dfs.core.windows.net",
-            credential=self.azure_storage_account_key,
-        )
+
+        service_client = await self.get_service_client()
        # Get file system client
        file_system_client = service_client.get_file_system_client(
            file_system=self.azure_storage_file_system
--- a/litellm/integrations/custom_logger.py
+++ b/litellm/integrations/custom_logger.py
@ -94,7 +94,7 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
        model: str,
        messages: List[AllMessageValues],
        non_default_params: dict,
-        prompt_id: str,
+        prompt_id: Optional[str],
        prompt_variables: Optional[dict],
        dynamic_callback_params: StandardCallbackDynamicParams,
    ) -> Tuple[str, List[AllMessageValues], dict]:
--- a/litellm/integrations/custom_prompt_management.py
+++ b/litellm/integrations/custom_prompt_management.py
@ -15,7 +15,7 @@ class CustomPromptManagement(CustomLogger, PromptManagementBase):
        model: str,
        messages: List[AllMessageValues],
        non_default_params: dict,
-        prompt_id: str,
+        prompt_id: Optional[str],
        prompt_variables: Optional[dict],
        dynamic_callback_params: StandardCallbackDynamicParams,
    ) -> Tuple[str, List[AllMessageValues], dict]:
--- a/litellm/integrations/gcs_pubsub/pub_sub.py
+++ b/litellm/integrations/gcs_pubsub/pub_sub.py
@ -75,7 +75,7 @@ class GcsPubSubLogger(CustomBatchLogger):
            vertex_project,
        ) = await vertex_chat_completion._ensure_access_token_async(
            credentials=self.path_service_account_json,
-            project_id=None,
+            project_id=self.project_id,
            custom_llm_provider="vertex_ai",
        )

--- a/litellm/integrations/humanloop.py
+++ b/litellm/integrations/humanloop.py
@ -152,14 +152,21 @@ class HumanloopLogger(CustomLogger):
        model: str,
        messages: List[AllMessageValues],
        non_default_params: dict,
-        prompt_id: str,
+        prompt_id: Optional[str],
        prompt_variables: Optional[dict],
        dynamic_callback_params: StandardCallbackDynamicParams,
-    ) -> Tuple[str, List[AllMessageValues], dict,]:
+    ) -> Tuple[
+        str,
+        List[AllMessageValues],
+        dict,
+    ]:
        humanloop_api_key = dynamic_callback_params.get(
            "humanloop_api_key"
        ) or get_secret_str("HUMANLOOP_API_KEY")

+        if prompt_id is None:
+            raise ValueError("prompt_id is required for Humanloop integration")
+
        if humanloop_api_key is None:
            return super().get_chat_completion_prompt(
                model=model,
--- a/litellm/integrations/langfuse/langfuse_prompt_management.py
+++ b/litellm/integrations/langfuse/langfuse_prompt_management.py
@ -169,10 +169,14 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
        model: str,
        messages: List[AllMessageValues],
        non_default_params: dict,
-        prompt_id: str,
+        prompt_id: Optional[str],
        prompt_variables: Optional[dict],
        dynamic_callback_params: StandardCallbackDynamicParams,
-    ) -> Tuple[str, List[AllMessageValues], dict,]:
+    ) -> Tuple[
+        str,
+        List[AllMessageValues],
+        dict,
+    ]:
        return self.get_chat_completion_prompt(
            model,
            messages,
--- a/litellm/integrations/prompt_management_base.py
+++ b/litellm/integrations/prompt_management_base.py
@ -79,10 +79,12 @@ class PromptManagementBase(ABC):
        model: str,
        messages: List[AllMessageValues],
        non_default_params: dict,
-        prompt_id: str,
+        prompt_id: Optional[str],
        prompt_variables: Optional[dict],
        dynamic_callback_params: StandardCallbackDynamicParams,
-    ) -> Tuple[str, List[AllMessageValues], dict,]:
+    ) -> Tuple[str, List[AllMessageValues], dict]:
+        if prompt_id is None:
+            raise ValueError("prompt_id is required for Prompt Management Base class")
        if not self.should_run_prompt_management(
            prompt_id=prompt_id, dynamic_callback_params=dynamic_callback_params
        ):
--- a/litellm/litellm_core_utils/get_model_cost_map.py
+++ b/litellm/litellm_core_utils/get_model_cost_map.py
@ -13,7 +13,7 @@ import os
 import httpx


-def get_model_cost_map(url: str):
+def get_model_cost_map(url: str) -> dict:
    if (
        os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
        or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@ -36,6 +36,7 @@ from litellm.cost_calculator import (
    RealtimeAPITokenUsageProcessor,
    _select_model_name_for_cost_calc,
 )
+from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
 from litellm.integrations.arize.arize import ArizeLogger
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
@ -249,9 +250,9 @@ class Logging(LiteLLMLoggingBaseClass):
        self.litellm_trace_id = litellm_trace_id
        self.function_id = function_id
        self.streaming_chunks: List[Any] = []  # for generating complete stream response
-        self.sync_streaming_chunks: List[
-            Any
-        ] = []  # for generating complete stream response
+        self.sync_streaming_chunks: List[Any] = (
+            []
+        )  # for generating complete stream response
        self.log_raw_request_response = log_raw_request_response

        # Initialize dynamic callbacks
@ -455,19 +456,38 @@ class Logging(LiteLLMLoggingBaseClass):
        if "custom_llm_provider" in self.model_call_details:
            self.custom_llm_provider = self.model_call_details["custom_llm_provider"]

+    def should_run_prompt_management_hooks(
+        self,
+        non_default_params: Dict,
+        prompt_id: Optional[str] = None,
+    ) -> bool:
+        """
+        Decide whether prompt management hooks need to run for this call.
+
+        Hooks run when a non-empty `prompt_id` is given, or when the Anthropic
+        cache control hook applies to `non_default_params`.
+        """
+        has_prompt_id = bool(prompt_id)
+        return (
+            has_prompt_id
+            or AnthropicCacheControlHook.should_use_anthropic_cache_control_hook(
+                non_default_params
+            )
+        )
+
    def get_chat_completion_prompt(
        self,
        model: str,
        messages: List[AllMessageValues],
-        non_default_params: dict,
-        prompt_id: str,
+        non_default_params: Dict,
+        prompt_id: Optional[str],
        prompt_variables: Optional[dict],
        prompt_management_logger: Optional[CustomLogger] = None,
    ) -> Tuple[str, List[AllMessageValues], dict]:
        custom_logger = (
            prompt_management_logger
-            or self.get_custom_logger_for_prompt_management(model)
+            or self.get_custom_logger_for_prompt_management(
+                model=model, non_default_params=non_default_params
+            )
        )
+
        if custom_logger:
            (
                model,
@ -476,7 +496,7 @@ class Logging(LiteLLMLoggingBaseClass):
            ) = custom_logger.get_chat_completion_prompt(
                model=model,
                messages=messages,
-                non_default_params=non_default_params,
+                non_default_params=non_default_params or {},
                prompt_id=prompt_id,
                prompt_variables=prompt_variables,
                dynamic_callback_params=self.standard_callback_dynamic_params,
@ -485,7 +505,7 @@ class Logging(LiteLLMLoggingBaseClass):
        return model, messages, non_default_params

    def get_custom_logger_for_prompt_management(
-        self, model: str
+        self, model: str, non_default_params: Dict
    ) -> Optional[CustomLogger]:
        """
        Get a custom logger for prompt management based on model name or available callbacks.
@ -520,6 +540,26 @@ class Logging(LiteLLMLoggingBaseClass):
            self.model_call_details["prompt_integration"] = logger.__class__.__name__
            return logger

+        if anthropic_cache_control_logger := AnthropicCacheControlHook.get_custom_logger_for_anthropic_cache_control_hook(
+            non_default_params
+        ):
+            self.model_call_details["prompt_integration"] = (
+                anthropic_cache_control_logger.__class__.__name__
+            )
+            return anthropic_cache_control_logger
+
+        return None
+
+    def get_custom_logger_for_anthropic_cache_control_hook(
+        self, non_default_params: Dict
+    ) -> Optional[CustomLogger]:
+        """Return the cache-control hook logger if injection points were passed."""
+        if non_default_params.get("cache_control_injection_points", None):
+            return _init_custom_logger_compatible_class(
+                logging_integration="anthropic_cache_control_hook",
+                internal_usage_cache=None,
+                llm_router=None,
+            )
        return None

    def _get_raw_request_body(self, data: Optional[Union[dict, str]]) -> dict:
@ -557,9 +597,9 @@ class Logging(LiteLLMLoggingBaseClass):
            model
        ):  # if model name was changes pre-call, overwrite the initial model call name with the new one
            self.model_call_details["model"] = model
-        self.model_call_details["litellm_params"][
-            "api_base"
-        ] = self._get_masked_api_base(additional_args.get("api_base", ""))
+        self.model_call_details["litellm_params"]["api_base"] = (
+            self._get_masked_api_base(additional_args.get("api_base", ""))
+        )

    def pre_call(self, input, api_key, model=None, additional_args={}):  # noqa: PLR0915
        # Log the exact input to the LLM API
@ -588,10 +628,10 @@ class Logging(LiteLLMLoggingBaseClass):
                try:
                    # [Non-blocking Extra Debug Information in metadata]
                    if turn_off_message_logging is True:
-                        _metadata[
-                            "raw_request"
-                        ] = "redacted by litellm. \
+                        _metadata["raw_request"] = (
+                            "redacted by litellm. \
                            'litellm.turn_off_message_logging=True'"
+                        )
                    else:
                        curl_command = self._get_request_curl_command(
                            api_base=additional_args.get("api_base", ""),
@ -602,32 +642,32 @@ class Logging(LiteLLMLoggingBaseClass):

                        _metadata["raw_request"] = str(curl_command)
                        # split up, so it's easier to parse in the UI
-                        self.model_call_details[
-                            "raw_request_typed_dict"
-                        ] = RawRequestTypedDict(
-                            raw_request_api_base=str(
-                                additional_args.get("api_base") or ""
-                            ),
-                            raw_request_body=self._get_raw_request_body(
-                                additional_args.get("complete_input_dict", {})
-                            ),
-                            raw_request_headers=self._get_masked_headers(
-                                additional_args.get("headers", {}) or {},
-                                ignore_sensitive_headers=True,
-                            ),
-                            error=None,
+                        self.model_call_details["raw_request_typed_dict"] = (
+                            RawRequestTypedDict(
+                                raw_request_api_base=str(
+                                    additional_args.get("api_base") or ""
+                                ),
+                                raw_request_body=self._get_raw_request_body(
+                                    additional_args.get("complete_input_dict", {})
+                                ),
+                                raw_request_headers=self._get_masked_headers(
+                                    additional_args.get("headers", {}) or {},
+                                    ignore_sensitive_headers=True,
+                                ),
+                                error=None,
+                            )
                        )
                except Exception as e:
-                    self.model_call_details[
-                        "raw_request_typed_dict"
-                    ] = RawRequestTypedDict(
-                        error=str(e),
+                    self.model_call_details["raw_request_typed_dict"] = (
+                        RawRequestTypedDict(
+                            error=str(e),
+                        )
                    )
-                    _metadata[
-                        "raw_request"
-                    ] = "Unable to Log \
+                    _metadata["raw_request"] = (
+                        "Unable to Log \
                        raw request: {}".format(
-                        str(e)
+                            str(e)
+                        )
                    )
            if self.logger_fn and callable(self.logger_fn):
                try:
@ -957,9 +997,9 @@ class Logging(LiteLLMLoggingBaseClass):
            verbose_logger.debug(
                f"response_cost_failure_debug_information: {debug_info}"
            )
-            self.model_call_details[
-                "response_cost_failure_debug_information"
-            ] = debug_info
+            self.model_call_details["response_cost_failure_debug_information"] = (
+                debug_info
+            )
            return None

        try:
@ -984,9 +1024,9 @@ class Logging(LiteLLMLoggingBaseClass):
            verbose_logger.debug(
                f"response_cost_failure_debug_information: {debug_info}"
            )
-            self.model_call_details[
-                "response_cost_failure_debug_information"
-            ] = debug_info
+            self.model_call_details["response_cost_failure_debug_information"] = (
+                debug_info
+            )

        return None

@ -1046,9 +1086,9 @@ class Logging(LiteLLMLoggingBaseClass):
                end_time = datetime.datetime.now()
            if self.completion_start_time is None:
                self.completion_start_time = end_time
-                self.model_call_details[
-                    "completion_start_time"
-                ] = self.completion_start_time
+                self.model_call_details["completion_start_time"] = (
+                    self.completion_start_time
+                )
            self.model_call_details["log_event_type"] = "successful_api_call"
            self.model_call_details["end_time"] = end_time
            self.model_call_details["cache_hit"] = cache_hit
@ -1127,39 +1167,39 @@ class Logging(LiteLLMLoggingBaseClass):
                            "response_cost"
                        ]
                    else:
-                        self.model_call_details[
-                            "response_cost"
-                        ] = self._response_cost_calculator(result=logging_result)
+                        self.model_call_details["response_cost"] = (
+                            self._response_cost_calculator(result=logging_result)
+                        )
                    ## STANDARDIZED LOGGING PAYLOAD

-                    self.model_call_details[
-                        "standard_logging_object"
-                    ] = get_standard_logging_object_payload(
-                        kwargs=self.model_call_details,
-                        init_response_obj=logging_result,
-                        start_time=start_time,
-                        end_time=end_time,
-                        logging_obj=self,
-                        status="success",
-                        standard_built_in_tools_params=self.standard_built_in_tools_params,
+                    self.model_call_details["standard_logging_object"] = (
+                        get_standard_logging_object_payload(
+                            kwargs=self.model_call_details,
+                            init_response_obj=logging_result,
+                            start_time=start_time,
+                            end_time=end_time,
+                            logging_obj=self,
+                            status="success",
+                            standard_built_in_tools_params=self.standard_built_in_tools_params,
+                        )
                    )
                elif isinstance(result, dict) or isinstance(result, list):
                    ## STANDARDIZED LOGGING PAYLOAD
-                    self.model_call_details[
-                        "standard_logging_object"
-                    ] = get_standard_logging_object_payload(
-                        kwargs=self.model_call_details,
-                        init_response_obj=result,
-                        start_time=start_time,
-                        end_time=end_time,
-                        logging_obj=self,
-                        status="success",
-                        standard_built_in_tools_params=self.standard_built_in_tools_params,
+                    self.model_call_details["standard_logging_object"] = (
+                        get_standard_logging_object_payload(
+                            kwargs=self.model_call_details,
+                            init_response_obj=result,
+                            start_time=start_time,
+                            end_time=end_time,
+                            logging_obj=self,
+                            status="success",
+                            standard_built_in_tools_params=self.standard_built_in_tools_params,
+                        )
                    )
            elif standard_logging_object is not None:
-                self.model_call_details[
-                    "standard_logging_object"
-                ] = standard_logging_object
+                self.model_call_details["standard_logging_object"] = (
+                    standard_logging_object
+                )
            else:  # streaming chunks + image gen.
                self.model_call_details["response_cost"] = None

@ -1215,23 +1255,23 @@ class Logging(LiteLLMLoggingBaseClass):
                verbose_logger.debug(
                    "Logging Details LiteLLM-Success Call streaming complete"
                )
-                self.model_call_details[
-                    "complete_streaming_response"
-                ] = complete_streaming_response
-                self.model_call_details[
-                    "response_cost"
-                ] = self._response_cost_calculator(result=complete_streaming_response)
+                self.model_call_details["complete_streaming_response"] = (
+                    complete_streaming_response
+                )
+                self.model_call_details["response_cost"] = (
+                    self._response_cost_calculator(result=complete_streaming_response)
+                )
                ## STANDARDIZED LOGGING PAYLOAD
-                self.model_call_details[
-                    "standard_logging_object"
-                ] = get_standard_logging_object_payload(
-                    kwargs=self.model_call_details,
-                    init_response_obj=complete_streaming_response,
-                    start_time=start_time,
-                    end_time=end_time,
-                    logging_obj=self,
-                    status="success",
-                    standard_built_in_tools_params=self.standard_built_in_tools_params,
+                self.model_call_details["standard_logging_object"] = (
+                    get_standard_logging_object_payload(
+                        kwargs=self.model_call_details,
+                        init_response_obj=complete_streaming_response,
+                        start_time=start_time,
+                        end_time=end_time,
+                        logging_obj=self,
+                        status="success",
+                        standard_built_in_tools_params=self.standard_built_in_tools_params,
+                    )
                )
            callbacks = self.get_combined_callback_list(
                dynamic_success_callbacks=self.dynamic_success_callbacks,
@ -1580,10 +1620,10 @@ class Logging(LiteLLMLoggingBaseClass):
                            )
                        else:
                            if self.stream and complete_streaming_response:
-                                self.model_call_details[
-                                    "complete_response"
-                                ] = self.model_call_details.get(
-                                    "complete_streaming_response", {}
+                                self.model_call_details["complete_response"] = (
+                                    self.model_call_details.get(
+                                        "complete_streaming_response", {}
+                                    )
                                )
                                result = self.model_call_details["complete_response"]
                            openMeterLogger.log_success_event(
@ -1623,10 +1663,10 @@ class Logging(LiteLLMLoggingBaseClass):
                            )
                        else:
                            if self.stream and complete_streaming_response:
-                                self.model_call_details[
-                                    "complete_response"
-                                ] = self.model_call_details.get(
-                                    "complete_streaming_response", {}
+                                self.model_call_details["complete_response"] = (
+                                    self.model_call_details.get(
+                                        "complete_streaming_response", {}
+                                    )
                                )
                                result = self.model_call_details["complete_response"]

@ -1733,9 +1773,9 @@ class Logging(LiteLLMLoggingBaseClass):
        if complete_streaming_response is not None:
            print_verbose("Async success callbacks: Got a complete streaming response")

-            self.model_call_details[
-                "async_complete_streaming_response"
-            ] = complete_streaming_response
+            self.model_call_details["async_complete_streaming_response"] = (
+                complete_streaming_response
+            )
            try:
                if self.model_call_details.get("cache_hit", False) is True:
                    self.model_call_details["response_cost"] = 0.0
@ -1745,10 +1785,10 @@ class Logging(LiteLLMLoggingBaseClass):
                        model_call_details=self.model_call_details
                    )
                    # base_model defaults to None if not set on model_info
-                    self.model_call_details[
-                        "response_cost"
-                    ] = self._response_cost_calculator(
-                        result=complete_streaming_response
+                    self.model_call_details["response_cost"] = (
+                        self._response_cost_calculator(
+                            result=complete_streaming_response
+                        )
                    )

                verbose_logger.debug(
@ -1761,16 +1801,16 @@ class Logging(LiteLLMLoggingBaseClass):
                self.model_call_details["response_cost"] = None

            ## STANDARDIZED LOGGING PAYLOAD
-            self.model_call_details[
-                "standard_logging_object"
-            ] = get_standard_logging_object_payload(
-                kwargs=self.model_call_details,
-                init_response_obj=complete_streaming_response,
-                start_time=start_time,
-                end_time=end_time,
-                logging_obj=self,
-                status="success",
-                standard_built_in_tools_params=self.standard_built_in_tools_params,
+            self.model_call_details["standard_logging_object"] = (
+                get_standard_logging_object_payload(
+                    kwargs=self.model_call_details,
+                    init_response_obj=complete_streaming_response,
+                    start_time=start_time,
+                    end_time=end_time,
+                    logging_obj=self,
+                    status="success",
+                    standard_built_in_tools_params=self.standard_built_in_tools_params,
+                )
            )
        callbacks = self.get_combined_callback_list(
            dynamic_success_callbacks=self.dynamic_async_success_callbacks,
@ -1976,18 +2016,18 @@ class Logging(LiteLLMLoggingBaseClass):

        ## STANDARDIZED LOGGING PAYLOAD

-        self.model_call_details[
-            "standard_logging_object"
-        ] = get_standard_logging_object_payload(
-            kwargs=self.model_call_details,
-            init_response_obj={},
-            start_time=start_time,
-            end_time=end_time,
-            logging_obj=self,
-            status="failure",
-            error_str=str(exception),
-            original_exception=exception,
-            standard_built_in_tools_params=self.standard_built_in_tools_params,
+        self.model_call_details["standard_logging_object"] = (
+            get_standard_logging_object_payload(
+                kwargs=self.model_call_details,
+                init_response_obj={},
+                start_time=start_time,
+                end_time=end_time,
+                logging_obj=self,
+                status="failure",
+                error_str=str(exception),
+                original_exception=exception,
+                standard_built_in_tools_params=self.standard_built_in_tools_params,
+            )
        )
        return start_time, end_time

@ -2753,9 +2793,9 @@ def _init_custom_logger_compatible_class(  # noqa: PLR0915
                endpoint=arize_config.endpoint,
            )

-            os.environ[
-                "OTEL_EXPORTER_OTLP_TRACES_HEADERS"
-            ] = f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
+            os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
+                f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
+            )
            for callback in _in_memory_loggers:
                if (
                    isinstance(callback, ArizeLogger)
@ -2779,9 +2819,9 @@ def _init_custom_logger_compatible_class(  # noqa: PLR0915

            # auth can be disabled on local deployments of arize phoenix
            if arize_phoenix_config.otlp_auth_headers is not None:
-                os.environ[
-                    "OTEL_EXPORTER_OTLP_TRACES_HEADERS"
-                ] = arize_phoenix_config.otlp_auth_headers
+                os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
+                    arize_phoenix_config.otlp_auth_headers
+                )

            for callback in _in_memory_loggers:
                if (
@ -2872,9 +2912,9 @@ def _init_custom_logger_compatible_class(  # noqa: PLR0915
                exporter="otlp_http",
                endpoint="https://langtrace.ai/api/trace",
            )
-            os.environ[
-                "OTEL_EXPORTER_OTLP_TRACES_HEADERS"
-            ] = f"api_key={os.getenv('LANGTRACE_API_KEY')}"
+            os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
+                f"api_key={os.getenv('LANGTRACE_API_KEY')}"
+            )
            for callback in _in_memory_loggers:
                if (
                    isinstance(callback, OpenTelemetry)
@ -2908,6 +2948,13 @@ def _init_custom_logger_compatible_class(  # noqa: PLR0915
            pagerduty_logger = PagerDutyAlerting(**custom_logger_init_args)
            _in_memory_loggers.append(pagerduty_logger)
            return pagerduty_logger  # type: ignore
+        elif logging_integration == "anthropic_cache_control_hook":
+            for callback in _in_memory_loggers:
+                if isinstance(callback, AnthropicCacheControlHook):
+                    return callback
+            anthropic_cache_control_hook = AnthropicCacheControlHook()
+            _in_memory_loggers.append(anthropic_cache_control_hook)
+            return anthropic_cache_control_hook  # type: ignore
        elif logging_integration == "gcs_pubsub":
            for callback in _in_memory_loggers:
                if isinstance(callback, GcsPubSubLogger):
@ -3046,6 +3093,10 @@ def get_custom_logger_compatible_class(  # noqa: PLR0915
            for callback in _in_memory_loggers:
                if isinstance(callback, PagerDutyAlerting):
                    return callback
+        elif logging_integration == "anthropic_cache_control_hook":
+            for callback in _in_memory_loggers:
+                if isinstance(callback, AnthropicCacheControlHook):
+                    return callback
        elif logging_integration == "gcs_pubsub":
            for callback in _in_memory_loggers:
                if isinstance(callback, GcsPubSubLogger):
@ -3369,10 +3420,10 @@ class StandardLoggingPayloadSetup:
            for key in StandardLoggingHiddenParams.__annotations__.keys():
                if key in hidden_params:
                    if key == "additional_headers":
-                        clean_hidden_params[
-                            "additional_headers"
-                        ] = StandardLoggingPayloadSetup.get_additional_headers(
-                            hidden_params[key]
+                        clean_hidden_params["additional_headers"] = (
+                            StandardLoggingPayloadSetup.get_additional_headers(
+                                hidden_params[key]
+                            )
                        )
                    else:
                        clean_hidden_params[key] = hidden_params[key]  # type: ignore
@ -3651,7 +3702,7 @@ def emit_standard_logging_payload(payload: StandardLoggingPayload):


 def get_standard_logging_metadata(
-    metadata: Optional[Dict[str, Any]]
+    metadata: Optional[Dict[str, Any]],
 ) -> StandardLoggingMetadata:
    """
    Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
@ -3715,9 +3766,9 @@ def scrub_sensitive_keys_in_metadata(litellm_params: Optional[dict]):
    ):
        for k, v in metadata["user_api_key_metadata"].items():
            if k == "logging":  # prevent logging user logging keys
-                cleaned_user_api_key_metadata[
-                    k
-                ] = "scrubbed_by_litellm_for_sensitive_keys"
+                cleaned_user_api_key_metadata[k] = (
+                    "scrubbed_by_litellm_for_sensitive_keys"
+                )
            else:
                cleaned_user_api_key_metadata[k] = v

--- a/litellm/litellm_core_utils/llm_cost_calc/utils.py
+++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py
@ -265,8 +265,10 @@ def generic_cost_per_token(
    )

    ## CALCULATE OUTPUT COST
-    text_tokens = usage.completion_tokens
+    text_tokens = 0
    audio_tokens = 0
+    reasoning_tokens = 0
+    is_text_tokens_total = False
    if usage.completion_tokens_details is not None:
        audio_tokens = (
            cast(
@ -280,9 +282,20 @@ def generic_cost_per_token(
                Optional[int],
                getattr(usage.completion_tokens_details, "text_tokens", None),
            )
-            or usage.completion_tokens  # default to completion tokens, if this field is not set
+            or 0  # default to completion tokens, if this field is not set
+        )
+        reasoning_tokens = (
+            cast(
+                Optional[int],
+                getattr(usage.completion_tokens_details, "reasoning_tokens", 0),
+            )
+            or 0
        )

+    if text_tokens == 0:
+        text_tokens = usage.completion_tokens
+    if text_tokens == usage.completion_tokens:
+        is_text_tokens_total = True
    ## TEXT COST
    completion_cost = float(text_tokens) * completion_base_cost

@ -290,12 +303,26 @@ def generic_cost_per_token(
        "output_cost_per_audio_token"
    )

+    _output_cost_per_reasoning_token: Optional[float] = model_info.get(
+        "output_cost_per_reasoning_token"
+    )
+
    ## AUDIO COST
-    if (
-        _output_cost_per_audio_token is not None
-        and audio_tokens is not None
-        and audio_tokens > 0
-    ):
+    if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0:
+        _output_cost_per_audio_token = (
+            _output_cost_per_audio_token
+            if _output_cost_per_audio_token is not None
+            else completion_base_cost
+        )
        completion_cost += float(audio_tokens) * _output_cost_per_audio_token

+    ## REASONING COST
+    if not is_text_tokens_total and reasoning_tokens and reasoning_tokens > 0:
+        _output_cost_per_reasoning_token = (
+            _output_cost_per_reasoning_token
+            if _output_cost_per_reasoning_token is not None
+            else completion_base_cost
+        )
+        completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
+
    return prompt_cost, completion_cost
--- a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
+++ b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
@ -14,6 +14,7 @@ from litellm.types.llms.openai import ChatCompletionThinkingBlock
 from litellm.types.utils import (
    ChatCompletionDeltaToolCall,
    ChatCompletionMessageToolCall,
+    ChatCompletionRedactedThinkingBlock,
    Choices,
    Delta,
    EmbeddingResponse,
@ -486,7 +487,14 @@ def convert_to_model_response_object(  # noqa: PLR0915
                    )

                    # Handle thinking models that display `thinking_blocks` within `content`
-                    thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
+                    thinking_blocks: Optional[
+                        List[
+                            Union[
+                                ChatCompletionThinkingBlock,
+                                ChatCompletionRedactedThinkingBlock,
+                            ]
+                        ]
+                    ] = None
                    if "thinking_blocks" in choice["message"]:
                        thinking_blocks = choice["message"]["thinking_blocks"]
                        provider_specific_fields["thinking_blocks"] = thinking_blocks
--- a/litellm/litellm_core_utils/prompt_templates/common_utils.py
+++ b/litellm/litellm_core_utils/prompt_templates/common_utils.py
@ -313,13 +313,20 @@ def get_format_from_file_id(file_id: Optional[str]) -> Optional[str]:
    unified_file_id = litellm_proxy:{};unified_id,{}
    If not a unified file id, returns 'file' as default format
    """
+    from litellm.proxy.hooks.managed_files import _PROXY_LiteLLMManagedFiles
+
    if not file_id:
        return None
    try:
-        if file_id.startswith(SpecialEnums.LITELM_MANAGED_FILE_ID_PREFIX.value):
+        transformed_file_id = (
+            _PROXY_LiteLLMManagedFiles._convert_b64_uid_to_unified_uid(file_id)
+        )
+        if transformed_file_id.startswith(
+            SpecialEnums.LITELM_MANAGED_FILE_ID_PREFIX.value
+        ):
            match = re.match(
                f"{SpecialEnums.LITELM_MANAGED_FILE_ID_PREFIX.value}:(.*?);unified_id",
-                file_id,
+                transformed_file_id,
            )
            if match:
                return match.group(1)
@ -343,6 +350,7 @@ def update_messages_with_model_file_ids(
        }
    }
    """
+
    for message in messages:
        if message.get("role") == "user":
            content = message.get("content")
@ -463,3 +471,59 @@ def unpack_defs(schema, defs):
                unpack_defs(ref, defs)
                value["items"] = ref
                continue
+
+
+def _get_image_mime_type_from_url(url: str) -> Optional[str]:
+    """
+    Get mime type for common media URLs (image, video, audio, document) by extension.
+    See gemini mime types: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding#image-requirements
+
+    The full lower-cased URL is checked first, then its path component alone, so
+    query strings / fragments (e.g. pre-signed URLs) do not hide the extension.
+    Returns None when no known extension matches.
+
+    Supported by Gemini:
+    application/pdf, audio/mpeg, audio/mp3, audio/wav, image/png, image/jpeg,
+    image/webp, text/plain, video/mov, video/mpeg, video/mp4, video/mpg,
+    video/avi, video/wmv, video/mpegps, video/flv
+    """
+    from urllib.parse import urlsplit
+
+    url = url.lower()
+
+    # Map file extensions to mime types
+    mime_types = {
+        # Images
+        (".jpg", ".jpeg"): "image/jpeg",
+        (".png",): "image/png",
+        (".webp",): "image/webp",
+        # Videos
+        (".mp4",): "video/mp4",
+        (".mov",): "video/mov",
+        (".mpeg", ".mpg"): "video/mpeg",
+        (".avi",): "video/avi",
+        (".wmv",): "video/wmv",
+        (".mpegps",): "video/mpegps",
+        (".flv",): "video/flv",
+        # Audio (".mpeg" is intentionally absent: it always resolves to video/mpeg above)
+        (".mp3",): "audio/mp3",
+        (".wav",): "audio/wav",
+        # Documents
+        (".pdf",): "application/pdf",
+        (".txt",): "text/plain",
+    }
+
+    # Raw URL first (keeps prior results), then the path without query/fragment
+    candidates = [url]
+    try:
+        path = urlsplit(url).path
+    except ValueError:  # malformed URL, e.g. unbalanced "[" in the host
+        path = ""
+    if path and path != url:
+        candidates.append(path)
+
+    for candidate in candidates:
+        for extensions, mime_type in mime_types.items():
+            if candidate.endswith(extensions):
+                return mime_type
+    return None
--- a/litellm/litellm_core_utils/prompt_templates/factory.py
+++ b/litellm/litellm_core_utils/prompt_templates/factory.py
@ -250,7 +250,7 @@ def ollama_pt(
                    f"Tool Calls: {json.dumps(ollama_tool_calls, indent=2)}"
                )

-            msg_i += 1
+                msg_i += 1

        if assistant_content_str:
            prompt += f"### Assistant:\n{assistant_content_str}\n\n"
@ -2258,6 +2258,14 @@ def _parse_content_type(content_type: str) -> str:
    return m.get_content_type()


+def _parse_mime_type(base64_data: str) -> Optional[str]:
+    """Return the text between a leading `data:` and the first `;base64`, else None."""
+    prefix_match = re.match(r"data:(.*?);base64", base64_data)
+    if prefix_match is None:
+        return None
+    return prefix_match.group(1)
+
+
 class BedrockImageProcessor:
    """Handles both sync and async image processing for Bedrock conversations."""

--- a/litellm/litellm_core_utils/streaming_chunk_builder_utils.py
+++ b/litellm/litellm_core_utils/streaming_chunk_builder_utils.py
@ -106,74 +106,63 @@ class ChunkProcessor:
    def get_combined_tool_content(
        self, tool_call_chunks: List[Dict[str, Any]]
    ) -> List[ChatCompletionMessageToolCall]:
+        """
+        Merge streamed tool-call deltas into complete tool calls.
+
+        Every choice of every chunk is scanned for `delta["tool_calls"]`.
+        Entries are grouped by their `index` attribute (0 when the attribute
+        is absent). For each index the last non-empty id, type and function
+        name are kept, and argument fragments are concatenated in arrival
+        order.
+
+        Returns one ChatCompletionMessageToolCall per index, in ascending
+        index order. Indices that never received both an id and a name are
+        skipped. Arguments default to "{}" and type defaults to "function".
+        """
-        argument_list: List[str] = []
-        delta = tool_call_chunks[0]["choices"][0]["delta"]
-        id = None
-        name = None
-        type = None
        tool_calls_list: List[ChatCompletionMessageToolCall] = []
-        prev_index = None
-        prev_name = None
-        prev_id = None
-        curr_id = None
-        curr_index = 0
+        tool_call_map: Dict[
+            int, Dict[str, Any]
+        ] = {}  # Map to store tool calls by index
+
        for chunk in tool_call_chunks:
            choices = chunk["choices"]
            for choice in choices:
                delta = choice.get("delta", {})
-                tool_calls = delta.get("tool_calls", "")
-                # Check if a tool call is present
-                if tool_calls and tool_calls[0].function is not None:
-                    if tool_calls[0].id:
-                        id = tool_calls[0].id
-                        curr_id = id
-                        if prev_id is None:
-                            prev_id = curr_id
-                    if tool_calls[0].index:
-                        curr_index = tool_calls[0].index
-                    if tool_calls[0].function.arguments:
-                        # Now, tool_calls is expected to be a dictionary
-                        arguments = tool_calls[0].function.arguments
-                        argument_list.append(arguments)
-                    if tool_calls[0].function.name:
-                        name = tool_calls[0].function.name
-                    if tool_calls[0].type:
-                        type = tool_calls[0].type
-            if prev_index is None:
-                prev_index = curr_index
-            if prev_name is None:
-                prev_name = name
-            if curr_index != prev_index:  # new tool call
-                combined_arguments = "".join(argument_list)
+                # A delta may carry the key with an explicit None value; the
+                # .get default does not apply then, so fall back to an empty
+                # list instead of iterating over None.
+                tool_calls = delta.get("tool_calls") or []
+
+                for tool_call in tool_calls:
+                    if not tool_call or not hasattr(tool_call, "function"):
+                        continue
+
+                    index = getattr(tool_call, "index", 0)
+                    if index not in tool_call_map:
+                        tool_call_map[index] = {
+                            "id": None,
+                            "name": None,
+                            "type": None,
+                            "arguments": [],
+                        }
+
+                    # Later non-empty values overwrite earlier ones.
+                    if hasattr(tool_call, "id") and tool_call.id:
+                        tool_call_map[index]["id"] = tool_call.id
+                    if hasattr(tool_call, "type") and tool_call.type:
+                        tool_call_map[index]["type"] = tool_call.type
+                    if hasattr(tool_call, "function"):
+                        if (
+                            hasattr(tool_call.function, "name")
+                            and tool_call.function.name
+                        ):
+                            tool_call_map[index]["name"] = tool_call.function.name
+                        if (
+                            hasattr(tool_call.function, "arguments")
+                            and tool_call.function.arguments
+                        ):
+                            tool_call_map[index]["arguments"].append(
+                                tool_call.function.arguments
+                            )
+
+        # Convert the map to a list of tool calls
+        for index in sorted(tool_call_map.keys()):
+            tool_call_data = tool_call_map[index]
+            if tool_call_data["id"] and tool_call_data["name"]:
+                combined_arguments = "".join(tool_call_data["arguments"]) or "{}"
                tool_calls_list.append(
                    ChatCompletionMessageToolCall(
-                        id=prev_id,
+                        id=tool_call_data["id"],
                        function=Function(
                            arguments=combined_arguments,
-                            name=prev_name,
+                            name=tool_call_data["name"],
                        ),
-                        type=type,
+                        type=tool_call_data["type"] or "function",
                    )
                )
-                argument_list = []  # reset
-                prev_index = curr_index
-                prev_id = curr_id
-                prev_name = name
-
-        combined_arguments = (
-            "".join(argument_list) or "{}"
-        )  # base case, return empty dict
-
-        tool_calls_list.append(
-            ChatCompletionMessageToolCall(
-                id=id,
-                type="function",
-                function=Function(
-                    arguments=combined_arguments,
-                    name=name,
-                ),
-            )
-        )

        return tool_calls_list

--- a/litellm/llms/anthropic/chat/handler.py
+++ b/litellm/llms/anthropic/chat/handler.py
@ -29,6 +29,7 @@ from litellm.types.llms.anthropic import (
    UsageDelta,
 )
 from litellm.types.llms.openai import (
+    ChatCompletionRedactedThinkingBlock,
    ChatCompletionThinkingBlock,
    ChatCompletionToolCallChunk,
 )
@ -501,18 +502,19 @@ class ModelResponseIterator:
    ) -> Tuple[
        str,
        Optional[ChatCompletionToolCallChunk],
-        List[ChatCompletionThinkingBlock],
+        List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]],
        Dict[str, Any],
    ]:
        """
        Helper function to handle the content block delta
        """
-
        text = ""
        tool_use: Optional[ChatCompletionToolCallChunk] = None
        provider_specific_fields = {}
        content_block = ContentBlockDelta(**chunk)  # type: ignore
-        thinking_blocks: List[ChatCompletionThinkingBlock] = []
+        thinking_blocks: List[
+            Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
+        ] = []

        self.content_blocks.append(content_block)
        if "text" in content_block["delta"]:
@ -541,20 +543,25 @@ class ModelResponseIterator:
                )
            ]
            provider_specific_fields["thinking_blocks"] = thinking_blocks
+
        return text, tool_use, thinking_blocks, provider_specific_fields

    def _handle_reasoning_content(
-        self, thinking_blocks: List[ChatCompletionThinkingBlock]
+        self,
+        thinking_blocks: List[
+            Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
+        ],
    ) -> Optional[str]:
        """
        Handle the reasoning content
        """
        reasoning_content = None
        for block in thinking_blocks:
+            # A block without a "thinking" key yields None here and adds no
+            # text, but it still turns the result from None into "".
+            thinking_content = cast(Optional[str], block.get("thinking"))
            if reasoning_content is None:
                reasoning_content = ""
-            if "thinking" in block:
-                reasoning_content += block["thinking"]
+            if thinking_content is not None:
+                reasoning_content += thinking_content
        return reasoning_content

    def chunk_parser(self, chunk: dict) -> ModelResponseStream:
@ -567,7 +574,13 @@ class ModelResponseIterator:
            usage: Optional[Usage] = None
            provider_specific_fields: Dict[str, Any] = {}
            reasoning_content: Optional[str] = None
-            thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
+            thinking_blocks: Optional[
+                List[
+                    Union[
+                        ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
+                    ]
+                ]
+            ] = None

            index = int(chunk.get("index", 0))
            if type_chunk == "content_block_delta":
@ -605,6 +618,15 @@ class ModelResponseIterator:
                        },
                        "index": self.tool_index,
                    }
+                elif (
+                    content_block_start["content_block"]["type"] == "redacted_thinking"
+                ):
+                    thinking_blocks = [
+                        ChatCompletionRedactedThinkingBlock(
+                            type="redacted_thinking",
+                            data=content_block_start["content_block"]["data"],
+                        )
+                    ]
            elif type_chunk == "content_block_stop":
                ContentBlockStop(**chunk)  # type: ignore
                # check if tool call content block
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@ -7,6 +7,9 @@ import httpx
 import litellm
 from litellm.constants import (
    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
+    DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
+    DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
    RESPONSE_FORMAT_TOOL_NAME,
 )
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
@ -27,6 +30,7 @@ from litellm.types.llms.openai import (
    REASONING_EFFORT,
    AllMessageValues,
    ChatCompletionCachedContent,
+    ChatCompletionRedactedThinkingBlock,
    ChatCompletionSystemMessage,
    ChatCompletionThinkingBlock,
    ChatCompletionToolCallChunk,
@ -44,7 +48,7 @@ from litellm.utils import (
    token_counter,
 )

-from ..common_utils import AnthropicError, process_anthropic_headers
+from ..common_utils import AnthropicError, AnthropicModelInfo, process_anthropic_headers

 if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
@ -54,7 +58,7 @@ else:
    LoggingClass = Any


-class AnthropicConfig(BaseConfig):
+class AnthropicConfig(AnthropicModelInfo, BaseConfig):
    """
    Reference: https://docs.anthropic.com/claude/reference/messages_post

@ -127,41 +131,6 @@ class AnthropicConfig(BaseConfig):
            "anthropic-beta": "prompt-caching-2024-07-31",
        }

-    def get_anthropic_headers(
-        self,
-        api_key: str,
-        anthropic_version: Optional[str] = None,
-        computer_tool_used: bool = False,
-        prompt_caching_set: bool = False,
-        pdf_used: bool = False,
-        is_vertex_request: bool = False,
-        user_anthropic_beta_headers: Optional[List[str]] = None,
-    ) -> dict:
-        betas = set()
-        if prompt_caching_set:
-            betas.add("prompt-caching-2024-07-31")
-        if computer_tool_used:
-            betas.add("computer-use-2024-10-22")
-        if pdf_used:
-            betas.add("pdfs-2024-09-25")
-        headers = {
-            "anthropic-version": anthropic_version or "2023-06-01",
-            "x-api-key": api_key,
-            "accept": "application/json",
-            "content-type": "application/json",
-        }
-
-        if user_anthropic_beta_headers is not None:
-            betas.update(user_anthropic_beta_headers)
-
-        # Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
-        if is_vertex_request is True:
-            pass
-        elif len(betas) > 0:
-            headers["anthropic-beta"] = ",".join(betas)
-
-        return headers
-
    def _map_tool_choice(
        self, tool_choice: Optional[str], parallel_tool_use: Optional[bool]
    ) -> Optional[AnthropicMessagesToolChoice]:
@ -311,11 +280,20 @@ class AnthropicConfig(BaseConfig):
        if reasoning_effort is None:
            return None
        elif reasoning_effort == "low":
-            return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
+            return AnthropicThinkingParam(
+                type="enabled",
+                budget_tokens=DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
+            )
        elif reasoning_effort == "medium":
-            return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
+            return AnthropicThinkingParam(
+                type="enabled",
+                budget_tokens=DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
+            )
        elif reasoning_effort == "high":
-            return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
+            return AnthropicThinkingParam(
+                type="enabled",
+                budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
+            )
        else:
            raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")

@ -446,49 +424,6 @@ class AnthropicConfig(BaseConfig):
        )
        return _tool

-    def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool:
-        """
-        Return if {"cache_control": ..} in message content block
-
-        Used to check if anthropic prompt caching headers need to be set.
-        """
-        for message in messages:
-            if message.get("cache_control", None) is not None:
-                return True
-            _message_content = message.get("content")
-            if _message_content is not None and isinstance(_message_content, list):
-                for content in _message_content:
-                    if "cache_control" in content:
-                        return True
-
-        return False
-
-    def is_computer_tool_used(
-        self, tools: Optional[List[AllAnthropicToolsValues]]
-    ) -> bool:
-        if tools is None:
-            return False
-        for tool in tools:
-            if "type" in tool and tool["type"].startswith("computer_"):
-                return True
-        return False
-
-    def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
-        """
-        Set to true if media passed into messages.
-
-        """
-        for message in messages:
-            if (
-                "content" in message
-                and message["content"] is not None
-                and isinstance(message["content"], list)
-            ):
-                for content in message["content"]:
-                    if "type" in content and content["type"] != "text":
-                        return True
-        return False
-
    def translate_system_message(
        self, messages: List[AllMessageValues]
    ) -> List[AnthropicSystemMessageContent]:
@ -641,13 +576,21 @@ class AnthropicConfig(BaseConfig):
    ) -> Tuple[
        str,
        Optional[List[Any]],
-        Optional[List[ChatCompletionThinkingBlock]],
+        Optional[
+            List[
+                Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
+            ]
+        ],
        Optional[str],
        List[ChatCompletionToolCallChunk],
    ]:
        text_content = ""
        citations: Optional[List[Any]] = None
-        thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
+        thinking_blocks: Optional[
+            List[
+                Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
+            ]
+        ] = None
        reasoning_content: Optional[str] = None
        tool_calls: List[ChatCompletionToolCallChunk] = []
        for idx, content in enumerate(completion_response["content"]):
@ -666,20 +609,30 @@ class AnthropicConfig(BaseConfig):
                        index=idx,
                    )
                )
-            ## CITATIONS
-            if content.get("citations", None) is not None:
-                if citations is None:
-                    citations = []
-                citations.append(content["citations"])
-            if content.get("thinking", None) is not None:
+
+            elif content.get("thinking", None) is not None:
                if thinking_blocks is None:
                    thinking_blocks = []
                thinking_blocks.append(cast(ChatCompletionThinkingBlock, content))
+            elif content["type"] == "redacted_thinking":
+                if thinking_blocks is None:
+                    thinking_blocks = []
+                thinking_blocks.append(
+                    cast(ChatCompletionRedactedThinkingBlock, content)
+                )
+
+            ## CITATIONS
+            if content.get("citations") is not None:
+                if citations is None:
+                    citations = []
+                citations.append(content["citations"])
        if thinking_blocks is not None:
            reasoning_content = ""
            for block in thinking_blocks:
-                if "thinking" in block:
-                    reasoning_content += block["thinking"]
+                thinking_content = cast(Optional[str], block.get("thinking"))
+                if thinking_content is not None:
+                    reasoning_content += thinking_content
+
        return text_content, citations, thinking_blocks, reasoning_content, tool_calls

    def calculate_usage(
@ -769,7 +722,13 @@ class AnthropicConfig(BaseConfig):
        else:
            text_content = ""
            citations: Optional[List[Any]] = None
-            thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
+            thinking_blocks: Optional[
+                List[
+                    Union[
+                        ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
+                    ]
+                ]
+            ] = None
            reasoning_content: Optional[str] = None
            tool_calls: List[ChatCompletionToolCallChunk] = []

@ -862,47 +821,3 @@ class AnthropicConfig(BaseConfig):
            message=error_message,
            headers=cast(httpx.Headers, headers),
        )
-
-    def _get_user_anthropic_beta_headers(
-        self, anthropic_beta_header: Optional[str]
-    ) -> Optional[List[str]]:
-        if anthropic_beta_header is None:
-            return None
-        return anthropic_beta_header.split(",")
-
-    def validate_environment(
-        self,
-        headers: dict,
-        model: str,
-        messages: List[AllMessageValues],
-        optional_params: dict,
-        litellm_params: dict,
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-    ) -> Dict:
-        if api_key is None:
-            raise litellm.AuthenticationError(
-                message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars",
-                llm_provider="anthropic",
-                model=model,
-            )
-
-        tools = optional_params.get("tools")
-        prompt_caching_set = self.is_cache_control_set(messages=messages)
-        computer_tool_used = self.is_computer_tool_used(tools=tools)
-        pdf_used = self.is_pdf_used(messages=messages)
-        user_anthropic_beta_headers = self._get_user_anthropic_beta_headers(
-            anthropic_beta_header=headers.get("anthropic-beta")
-        )
-        anthropic_headers = self.get_anthropic_headers(
-            computer_tool_used=computer_tool_used,
-            prompt_caching_set=prompt_caching_set,
-            pdf_used=pdf_used,
-            api_key=api_key,
-            is_vertex_request=optional_params.get("is_vertex_request", False),
-            user_anthropic_beta_headers=user_anthropic_beta_headers,
-        )
-
-        headers = {**headers, **anthropic_headers}
-
-        return headers
--- a/litellm/llms/anthropic/common_utils.py
+++ b/litellm/llms/anthropic/common_utils.py
@ -2,7 +2,7 @@
 This file contains common utils for anthropic calls.
 """

-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

 import httpx

@ -10,6 +10,8 @@ import litellm
 from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
 from litellm.secret_managers.main import get_secret_str
+from litellm.types.llms.anthropic import AllAnthropicToolsValues
+from litellm.types.llms.openai import AllMessageValues


 class AnthropicError(BaseLLMException):
@ -23,6 +25,128 @@ class AnthropicError(BaseLLMException):


 class AnthropicModelInfo(BaseLLMModelInfo):
+    def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool:
+        """
+        Report whether any message, or any entry of a list-valued message
+        content, carries a "cache_control" setting.
+
+        Used to check if anthropic prompt caching headers need to be set.
+        """
+        for msg in messages:
+            if msg.get("cache_control", None) is not None:
+                return True
+            parts = msg.get("content")
+            if isinstance(parts, list) and any(
+                "cache_control" in part for part in parts
+            ):
+                return True
+        return False
+
+    def is_computer_tool_used(
+        self, tools: Optional[List[AllAnthropicToolsValues]]
+    ) -> bool:
+        """
+        Return True when at least one tool has a "type" that starts with
+        "computer_". A missing tool list (None) counts as False.
+        """
+        if tools is None:
+            return False
+        return any(
+            "type" in entry and entry["type"].startswith("computer_")
+            for entry in tools
+        )
+
+    def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
+        """
+        Return True when any entry of a list-valued message content has a
+        "type" other than "text".
+
+        Messages whose content is missing, None, or not a list are ignored.
+        """
+        for msg in messages:
+            parts = msg.get("content")
+            if isinstance(parts, list) and any(
+                "type" in part and part["type"] != "text" for part in parts
+            ):
+                return True
+        return False
+
+    def _get_user_anthropic_beta_headers(
+        self, anthropic_beta_header: Optional[str]
+    ) -> Optional[List[str]]:
+        """
+        Split a user-supplied "anthropic-beta" header value into beta names.
+
+        Args:
+            anthropic_beta_header: Raw comma-separated header value, or None
+                when the caller sent no such header.
+
+        Returns:
+            None when no header was supplied; otherwise the list of names
+            with surrounding whitespace removed and empty items dropped.
+        """
+        if anthropic_beta_header is None:
+            return None
+        # Values such as "a, b" or a trailing comma would otherwise yield
+        # " b" or "" entries that end up verbatim in the outgoing header.
+        return [
+            beta.strip() for beta in anthropic_beta_header.split(",") if beta.strip()
+        ]
+
+    def get_anthropic_headers(
+        self,
+        api_key: str,
+        anthropic_version: Optional[str] = None,
+        computer_tool_used: bool = False,
+        prompt_caching_set: bool = False,
+        pdf_used: bool = False,
+        is_vertex_request: bool = False,
+        user_anthropic_beta_headers: Optional[List[str]] = None,
+    ) -> dict:
+        """
+        Build the headers for an Anthropic request.
+
+        Args:
+            api_key: Value sent as "x-api-key".
+            anthropic_version: Value for "anthropic-version"; "2023-06-01"
+                is used when this is None or empty.
+            computer_tool_used: Adds the "computer-use-2024-10-22" beta.
+            prompt_caching_set: Adds the "prompt-caching-2024-07-31" beta.
+            pdf_used: Adds the "pdfs-2024-09-25" beta.
+            is_vertex_request: When True, no "anthropic-beta" header is set.
+            user_anthropic_beta_headers: Extra beta names merged into the set.
+
+        Returns:
+            A dict of headers. "anthropic-beta" is present only when at least
+            one beta applies and the request is not a Vertex request.
+        """
+        betas = set()
+        if prompt_caching_set:
+            betas.add("prompt-caching-2024-07-31")
+        if computer_tool_used:
+            betas.add("computer-use-2024-10-22")
+        if pdf_used:
+            betas.add("pdfs-2024-09-25")
+        headers = {
+            "anthropic-version": anthropic_version or "2023-06-01",
+            "x-api-key": api_key,
+            "accept": "application/json",
+            "content-type": "application/json",
+        }
+
+        if user_anthropic_beta_headers is not None:
+            betas.update(user_anthropic_beta_headers)
+
+        # Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
+        if is_vertex_request is True:
+            pass
+        elif len(betas) > 0:
+            # Sort so the header value is stable; joining the set directly
+            # gives an order that can differ from one run to the next.
+            headers["anthropic-beta"] = ",".join(sorted(betas))
+
+        return headers
+
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        litellm_params: dict,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ) -> Dict:
+        """
+        Check that an API key is present and return the request headers.
+
+        The beta flags are derived from `messages` and from
+        `optional_params["tools"]`. Any "anthropic-beta" value already in
+        `headers` is passed along as user-supplied betas. The generated
+        headers take precedence over the incoming ones on key collisions.
+
+        Raises:
+            litellm.AuthenticationError: when `api_key` is None.
+        """
+        if api_key is None:
+            raise litellm.AuthenticationError(
+                message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars",
+                llm_provider="anthropic",
+                model=model,
+            )
+
+        tools = optional_params.get("tools")
+        generated_headers = self.get_anthropic_headers(
+            prompt_caching_set=self.is_cache_control_set(messages=messages),
+            computer_tool_used=self.is_computer_tool_used(tools=tools),
+            pdf_used=self.is_pdf_used(messages=messages),
+            user_anthropic_beta_headers=self._get_user_anthropic_beta_headers(
+                anthropic_beta_header=headers.get("anthropic-beta")
+            ),
+            api_key=api_key,
+            is_vertex_request=optional_params.get("is_vertex_request", False),
+        )
+
+        return {**headers, **generated_headers}
+
    @staticmethod
    def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
        return (
--- a/litellm/llms/azure/assistants.py
+++ b/litellm/llms/azure/assistants.py
@ -288,6 +288,7 @@ class AzureAssistantsAPI(BaseAzureLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
+            litellm_params=litellm_params,
        )

        thread_message: OpenAIMessage = openai_client.beta.threads.messages.create(  # type: ignore
--- a/litellm/llms/azure/chat/gpt_transformation.py
+++ b/litellm/llms/azure/chat/gpt_transformation.py
@ -125,14 +125,22 @@ class AzureOpenAIConfig(BaseConfig):
    ) -> bool:
        """
        - check if api_version is supported for response_format
+        - returns True if the API version is equal to or newer than the supported version
        """
+        api_year = int(api_version_year)
+        api_month = int(api_version_month)
+        supported_year = int(API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT)
+        supported_month = int(API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT)

-        is_supported = (
-            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
-            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
-        )
-
-        return is_supported
+        # If the year is greater than supported year, it's definitely supported
+        if api_year > supported_year:
+            return True
+        # If the year is less than supported year, it's not supported
+        elif api_year < supported_year:
+            return False
+        # If same year, check if month is >= supported month
+        else:
+            return api_month >= supported_month

    def map_openai_params(
        self,
@ -202,6 +210,7 @@ class AzureOpenAIConfig(BaseConfig):
                    is_response_format_supported_api_version
                    and _is_response_format_supported_model
                )
+
                optional_params = self._add_response_format_to_tools(
                    optional_params=optional_params,
                    value=value,
--- a/litellm/llms/azure/chat/o_series_transformation.py
+++ b/litellm/llms/azure/chat/o_series_transformation.py
@ -79,7 +79,7 @@ class AzureOpenAIO1Config(OpenAIOSeriesConfig):
        return True

    def is_o_series_model(self, model: str) -> bool:
-        return "o1" in model or "o3" in model or "o_series/" in model
+        """Return True when the model name contains "o1", "o3", "o4" or "o_series/"."""
+        o_series_markers = ("o1", "o3", "o4", "o_series/")
+        return any(marker in model for marker in o_series_markers)

    def transform_request(
        self,
--- a/litellm/llms/azure/common_utils.py
+++ b/litellm/llms/azure/common_utils.py
@ -61,7 +61,7 @@ def process_azure_headers(headers: Union[httpx.Headers, dict]) -> dict:
    return {**llm_response_headers, **openai_headers}


-def get_azure_ad_token_from_entrata_id(
+def get_azure_ad_token_from_entra_id(
    tenant_id: str,
    client_id: str,
    client_secret: str,
@ -81,7 +81,7 @@ def get_azure_ad_token_from_entrata_id(
    """
    from azure.identity import ClientSecretCredential, get_bearer_token_provider

-    verbose_logger.debug("Getting Azure AD Token from Entrata ID")
+    verbose_logger.debug("Getting Azure AD Token from Entra ID")

    if tenant_id.startswith("os.environ/"):
        _tenant_id = get_secret_str(tenant_id)
@ -309,21 +309,30 @@ class BaseAzureLLM(BaseOpenAILLM):
        azure_ad_token_provider: Optional[Callable[[], str]] = None
        # If we have api_key, then we have higher priority
        azure_ad_token = litellm_params.get("azure_ad_token")
-        tenant_id = litellm_params.get("tenant_id")
-        client_id = litellm_params.get("client_id")
-        client_secret = litellm_params.get("client_secret")
-        azure_username = litellm_params.get("azure_username")
-        azure_password = litellm_params.get("azure_password")
+        tenant_id = litellm_params.get("tenant_id", os.getenv("AZURE_TENANT_ID"))
+        client_id = litellm_params.get("client_id", os.getenv("AZURE_CLIENT_ID"))
+        client_secret = litellm_params.get(
+            "client_secret", os.getenv("AZURE_CLIENT_SECRET")
+        )
+        azure_username = litellm_params.get(
+            "azure_username", os.getenv("AZURE_USERNAME")
+        )
+        azure_password = litellm_params.get(
+            "azure_password", os.getenv("AZURE_PASSWORD")
+        )
        max_retries = litellm_params.get("max_retries")
        timeout = litellm_params.get("timeout")
        if not api_key and tenant_id and client_id and client_secret:
-            verbose_logger.debug("Using Azure AD Token Provider for Azure Auth")
-            azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
+            verbose_logger.debug(
+                "Using Azure AD Token Provider from Entra ID for Azure Auth"
+            )
+            azure_ad_token_provider = get_azure_ad_token_from_entra_id(
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
        if azure_username and azure_password and client_id:
+            verbose_logger.debug("Using Azure Username and Password for Azure Auth")
            azure_ad_token_provider = get_azure_ad_token_from_username_password(
                azure_username=azure_username,
                azure_password=azure_password,
@ -331,12 +340,16 @@ class BaseAzureLLM(BaseOpenAILLM):
            )

        if azure_ad_token is not None and azure_ad_token.startswith("oidc/"):
+            verbose_logger.debug("Using Azure OIDC Token for Azure Auth")
            azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
        elif (
            not api_key
            and azure_ad_token_provider is None
            and litellm.enable_azure_ad_token_refresh is True
        ):
+            verbose_logger.debug(
+                "Using Azure AD token provider based on Service Principal with Secret workflow for Azure Auth"
+            )
            try:
                azure_ad_token_provider = get_azure_ad_token_provider()
            except ValueError:
--- a/litellm/llms/azure/responses/transformation.py
+++ b/litellm/llms/azure/responses/transformation.py
@ -0,0 +1,94 @@
+from typing import TYPE_CHECKING, Any, Optional, cast
+
+import httpx
+
+import litellm
+from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
+from litellm.secret_managers.main import get_secret_str
+from litellm.types.llms.openai import *
+from litellm.utils import _add_path_to_api_base
+
+if TYPE_CHECKING:
+    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
+
+    LiteLLMLoggingObj = _LiteLLMLoggingObj
+else:
+    LiteLLMLoggingObj = Any
+
+
+class AzureOpenAIResponsesAPIConfig(OpenAIResponsesAPIConfig):
+    """
+    Responses API configuration for Azure OpenAI.
+
+    Overrides header construction (`validate_environment`) and URL
+    construction (`get_complete_url`) of `OpenAIResponsesAPIConfig`.
+    """
+
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        api_key: Optional[str] = None,
+    ) -> dict:
+        """
+        Resolve the API key and add it to `headers` as a Bearer token.
+
+        The key is the first truthy value of: the `api_key` argument,
+        `litellm.api_key`, `litellm.azure_key`, the `AZURE_OPENAI_API_KEY`
+        secret, the `AZURE_API_KEY` secret.
+
+        Args:
+        - headers: Request headers; updated in place.
+        - model: Model name (not used by this implementation).
+        - api_key: Explicit API key, if any.
+
+        Returns:
+        - The same `headers` dict that was passed in.
+        """
+        api_key = (
+            api_key
+            or litellm.api_key
+            or litellm.azure_key
+            or get_secret_str("AZURE_OPENAI_API_KEY")
+            or get_secret_str("AZURE_API_KEY")
+        )
+
+        # Only set the header when a key was actually resolved. Formatting a
+        # missing key would send the literal string "Bearer None" and would
+        # overwrite any Authorization header the caller already supplied.
+        if api_key:
+            headers.update(
+                {
+                    "Authorization": f"Bearer {api_key}",
+                }
+            )
+        return headers
+
+    def get_complete_url(
+        self,
+        api_base: Optional[str],
+        api_key: Optional[str],
+        model: str,
+        optional_params: dict,
+        litellm_params: dict,
+        stream: Optional[bool] = None,
+    ) -> str:
+        """
+        Constructs a complete URL for the API request.
+
+        Args:
+        - api_base: Base URL, e.g.,
+            "https://litellm8397336933.openai.azure.com"
+            OR
+            "https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview"
+            Falls back to `litellm.api_base`, then the `AZURE_API_BASE` secret.
+        - api_key: Not used by this implementation.
+        - model: Model name (not used by this implementation).
+        - optional_params: Not used by this implementation.
+        - litellm_params: Read for "api_version", which becomes the
+            "api-version" query parameter unless the URL already carries one.
+        - stream: If streaming is required (not used by this implementation).
+
+        Returns:
+        - A complete URL string, e.g.,
+        "https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview"
+
+        Raises:
+        - ValueError: If no api_base can be resolved.
+        """
+        api_base = api_base or litellm.api_base or get_secret_str("AZURE_API_BASE")
+        if api_base is None:
+            raise ValueError(
+                f"api_base is required for Azure OpenAI. Please set the api_base parameter. Passed `api_base={api_base}`"
+            )
+        original_url = httpx.URL(api_base)
+
+        # api_version is optional; when absent no "api-version" param is added
+        api_version = cast(Optional[str], litellm_params.get("api_version"))
+
+        # Start from the query params already present on the base URL
+        query_params = dict(original_url.params)
+
+        # An "api-version" already on the URL wins over litellm_params
+        if "api-version" not in query_params and api_version:
+            query_params["api-version"] = api_version
+
+        # Append the responses path unless the base URL already contains it
+        if "/openai/responses" not in api_base:
+            new_url = _add_path_to_api_base(
+                api_base=api_base, ending_path="/openai/responses"
+            )
+        else:
+            new_url = api_base
+
+        # Re-attach the merged query params to the final URL
+        final_url = httpx.URL(new_url).copy_with(params=query_params)
+
+        return str(final_url)
--- a/litellm/llms/azure_ai/chat/transformation.py
+++ b/litellm/llms/azure_ai/chat/transformation.py
@ -1,3 +1,4 @@
+import enum
 from typing import Any, List, Optional, Tuple, cast
 from urllib.parse import urlparse

@ -19,6 +20,10 @@ from litellm.types.utils import ModelResponse, ProviderField
 from litellm.utils import _add_path_to_api_base, supports_tool_choice


+class AzureFoundryErrorStrings(str, enum.Enum):
+    """String-valued enum of fragments that are searched for in HTTP error response text."""
+
+    # Matched with `in` against the error text to decide whether to retry
+    # after dropping the parameters the error message lists.
+    SET_EXTRA_PARAMETERS_TO_PASS_THROUGH = "Set extra-parameters to 'pass-through'"
+
+
 class AzureAIStudioConfig(OpenAIConfig):
    def get_supported_openai_params(self, model: str) -> List:
        model_supports_tool_choice = True  # azure ai supports this by default
@ -240,12 +245,18 @@ class AzureAIStudioConfig(OpenAIConfig):
    ) -> bool:
        should_drop_params = litellm_params.get("drop_params") or litellm.drop_params
        error_text = e.response.text
+
        if should_drop_params and "Extra inputs are not permitted" in error_text:
            return True
        elif (
            "unknown field: parameter index is not a valid field" in error_text
        ):  # remove index from tool calls
            return True
+        elif (
+            AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
+            in error_text
+        ):  # remove extra-parameters from tool calls
+            return True
        return super().should_retry_llm_api_inside_llm_translation_on_http_error(
            e=e, litellm_params=litellm_params
        )
@ -265,5 +276,46 @@ class AzureAIStudioConfig(OpenAIConfig):
            litellm.remove_index_from_tool_calls(
                messages=_messages,
            )
+        elif (
+            AzureFoundryErrorStrings.SET_EXTRA_PARAMETERS_TO_PASS_THROUGH.value
+            in e.response.text
+        ):
+            request_data = self._drop_extra_params_from_request_data(
+                request_data, e.response.text
+            )
        data = drop_params_from_unprocessable_entity_error(e=e, data=request_data)
        return data
+
+    def _drop_extra_params_from_request_data(
+        self, request_data: dict, error_text: str
+    ) -> dict:
+        """
+        Remove every top-level key named in `error_text` from `request_data`.
+
+        The names come from `_extract_params_to_drop_from_error_text`.
+        `request_data` is modified in place and the same dict is returned.
+        """
+        rejected = self._extract_params_to_drop_from_error_text(error_text) or []
+        for name in rejected:
+            # pop with a default is a no-op for keys that are not present
+            request_data.pop(name, None)
+        return request_data
+
+    def _extract_params_to_drop_from_error_text(
+        self, error_text: str
+    ) -> Optional[List[str]]:
+        """
+        Pull the listed parameter names out of an error message.
+
+        Example error text:
+            "Extra parameters ['stream_options', 'extra-parameters'] are not allowed when extra-parameters is not set or set to be 'error'."
+
+        Returns the names found inside the first `[...]` group of
+        `error_text`, or an empty list when there is no such group.
+        """
+        import re
+
+        # Non-greedy: only the first bracketed group is considered
+        bracketed = re.search(r"\[(.*?)\]", error_text)
+        if bracketed is None:
+            return []
+
+        # Split on commas, then peel whitespace and surrounding quotes off
+        # each piece; pieces that end up empty are skipped.
+        candidates = (
+            piece.strip().strip("'").strip('"')
+            for piece in bracketed.group(1).split(",")
+        )
+        return [name for name in candidates if name]
--- a/litellm/llms/base_llm/base_model_iterator.py
+++ b/litellm/llms/base_llm/base_model_iterator.py
@ -1,9 +1,16 @@
 import json
 from abc import abstractmethod
-from typing import Optional, Union
+from typing import List, Optional, Union, cast

 import litellm
-from litellm.types.utils import GenericStreamingChunk, ModelResponseStream
+from litellm.types.utils import (
+    Choices,
+    Delta,
+    GenericStreamingChunk,
+    ModelResponse,
+    ModelResponseStream,
+    StreamingChoices,
+)


 class BaseModelResponseIterator:
@ -121,6 +128,59 @@ class BaseModelResponseIterator:
            raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")


+class MockResponseIterator:  # for returning ai21 streaming responses
+    """
+    One-shot iterator, usable both synchronously and asynchronously.
+
+    It holds a single complete `ModelResponse`, yields it once converted to
+    a `ModelResponseStream`, and is exhausted afterwards.
+    """
+
+    def __init__(
+        self, model_response: ModelResponse, json_mode: Optional[bool] = False
+    ):
+        self.model_response = model_response
+        self.json_mode = json_mode
+        # Flipped to True on the first __next__/__anext__ call
+        self.is_done = False
+
+    def _chunk_parser(self, chunk_data: ModelResponse) -> ModelResponseStream:
+        """
+        Convert a full response into one stream chunk.
+
+        Each choice's message is dumped and re-used as the delta of the
+        matching streaming choice. Any failure is re-raised as ValueError.
+        """
+        try:
+            converted: List[StreamingChoices] = [
+                StreamingChoices(
+                    index=item.index,
+                    delta=Delta(
+                        **cast(Choices, item).message.model_dump(),
+                    ),
+                    finish_reason=item.finish_reason,
+                )
+                for item in chunk_data.choices
+            ]
+            return ModelResponseStream(
+                id=chunk_data.id,
+                object="chat.completion",
+                created=chunk_data.created,
+                model=chunk_data.model,
+                choices=converted,
+            )
+        except Exception as e:
+            raise ValueError(f"Failed to decode chunk: {chunk_data}. Error: {e}")
+
+    # Sync iterator
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.is_done:
+            # Mark as exhausted before parsing, so a parse failure still
+            # leaves the iterator finished.
+            self.is_done = True
+            return self._chunk_parser(self.model_response)
+        raise StopIteration
+
+    # Async iterator
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if not self.is_done:
+            self.is_done = True
+            return self._chunk_parser(self.model_response)
+        raise StopAsyncIteration
+
+
 class FakeStreamResponseIterator:
    def __init__(self, model_response, json_mode: Optional[bool] = False):
        self.model_response = model_response
--- a/litellm/llms/base_llm/base_utils.py
+++ b/litellm/llms/base_llm/base_utils.py
@ -44,6 +44,19 @@ class BaseLLMModelInfo(ABC):
    def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
        pass

+    @abstractmethod
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        messages: List[AllMessageValues],
+        optional_params: dict,
+        litellm_params: dict,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ) -> dict:
+        """
+        Abstract hook; every concrete subclass must provide an implementation.
+
+        Receives the request headers, the model name, the messages, the
+        optional and litellm parameter dicts, and an optional api_key /
+        api_base. Returns a dict.
+
+        NOTE(review): presumably the returned dict is the set of headers to
+        send with the request - confirm against the concrete implementations.
+        """
+        pass
+
    @staticmethod
    @abstractmethod
    def get_base_model(model: str) -> Optional[str]:
--- a/litellm/llms/base_llm/responses/transformation.py
+++ b/litellm/llms/base_llm/responses/transformation.py
@ -73,7 +73,10 @@ class BaseResponsesAPIConfig(ABC):
    def get_complete_url(
        self,
        api_base: Optional[str],
+        api_key: Optional[str],
        model: str,
+        optional_params: dict,
+        litellm_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        """
--- a/litellm/llms/bedrock/chat/converse_transformation.py
+++ b/litellm/llms/bedrock/chat/converse_transformation.py
@ -22,6 +22,7 @@ from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMExcepti
 from litellm.types.llms.bedrock import *
 from litellm.types.llms.openai import (
    AllMessageValues,
+    ChatCompletionRedactedThinkingBlock,
    ChatCompletionResponseMessage,
    ChatCompletionSystemMessage,
    ChatCompletionThinkingBlock,
@ -375,25 +376,27 @@ class AmazonConverseConfig(BaseConfig):
        system_content_blocks: List[SystemContentBlock] = []
        for idx, message in enumerate(messages):
            if message["role"] == "system":
-                _system_content_block: Optional[SystemContentBlock] = None
-                _cache_point_block: Optional[SystemContentBlock] = None
-                if isinstance(message["content"], str) and len(message["content"]) > 0:
-                    _system_content_block = SystemContentBlock(text=message["content"])
-                    _cache_point_block = self._get_cache_point_block(
+                system_prompt_indices.append(idx)
+                if isinstance(message["content"], str) and message["content"]:
+                    system_content_blocks.append(
+                        SystemContentBlock(text=message["content"])
+                    )
+                    cache_block = self._get_cache_point_block(
                        message, block_type="system"
                    )
+                    if cache_block:
+                        system_content_blocks.append(cache_block)
                elif isinstance(message["content"], list):
                    for m in message["content"]:
-                        if m.get("type", "") == "text" and len(m["text"]) > 0:
-                            _system_content_block = SystemContentBlock(text=m["text"])
-                            _cache_point_block = self._get_cache_point_block(
+                        if m.get("type") == "text" and m.get("text"):
+                            system_content_blocks.append(
+                                SystemContentBlock(text=m["text"])
+                            )
+                            cache_block = self._get_cache_point_block(
                                m, block_type="system"
                            )
-                if _system_content_block is not None:
-                    system_content_blocks.append(_system_content_block)
-                if _cache_point_block is not None:
-                    system_content_blocks.append(_cache_point_block)
-                system_prompt_indices.append(idx)
+                            if cache_block:
+                                system_content_blocks.append(cache_block)
        if len(system_prompt_indices) > 0:
            for idx in reversed(system_prompt_indices):
                messages.pop(idx)
@ -627,9 +630,11 @@ class AmazonConverseConfig(BaseConfig):

    def _transform_thinking_blocks(
        self, thinking_blocks: List[BedrockConverseReasoningContentBlock]
-    ) -> List[ChatCompletionThinkingBlock]:
+    ) -> List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]]:
        """Return a consistent format for thinking blocks between Anthropic and Bedrock."""
-        thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
+        thinking_blocks_list: List[
+            Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
+        ] = []
        for block in thinking_blocks:
            if "reasoningText" in block:
                _thinking_block = ChatCompletionThinkingBlock(type="thinking")
@ -640,6 +645,11 @@ class AmazonConverseConfig(BaseConfig):
                if _signature is not None:
                    _thinking_block["signature"] = _signature
                thinking_blocks_list.append(_thinking_block)
+            elif "redactedContent" in block:
+                _redacted_block = ChatCompletionRedactedThinkingBlock(
+                    type="redacted_thinking", data=block["redactedContent"]
+                )
+                thinking_blocks_list.append(_redacted_block)
        return thinking_blocks_list

    def _transform_usage(self, usage: ConverseTokenUsageBlock) -> Usage:
--- a/Show more
+++ b/Show more