Merge branch 'BerriAI:main' into main
|
@ -49,7 +49,7 @@ jobs:
|
|||
pip install opentelemetry-api==1.25.0
|
||||
pip install opentelemetry-sdk==1.25.0
|
||||
pip install opentelemetry-exporter-otlp==1.25.0
|
||||
pip install openai==1.54.0
|
||||
pip install openai==1.66.1
|
||||
pip install prisma==0.11.0
|
||||
pip install "detect_secrets==1.5.0"
|
||||
pip install "httpx==0.24.1"
|
||||
|
@ -168,7 +168,7 @@ jobs:
|
|||
pip install opentelemetry-api==1.25.0
|
||||
pip install opentelemetry-sdk==1.25.0
|
||||
pip install opentelemetry-exporter-otlp==1.25.0
|
||||
pip install openai==1.54.0
|
||||
pip install openai==1.66.1
|
||||
pip install prisma==0.11.0
|
||||
pip install "detect_secrets==1.5.0"
|
||||
pip install "httpx==0.24.1"
|
||||
|
@ -267,7 +267,7 @@ jobs:
|
|||
pip install opentelemetry-api==1.25.0
|
||||
pip install opentelemetry-sdk==1.25.0
|
||||
pip install opentelemetry-exporter-otlp==1.25.0
|
||||
pip install openai==1.54.0
|
||||
pip install openai==1.66.1
|
||||
pip install prisma==0.11.0
|
||||
pip install "detect_secrets==1.5.0"
|
||||
pip install "httpx==0.24.1"
|
||||
|
@ -511,7 +511,7 @@ jobs:
|
|||
pip install opentelemetry-api==1.25.0
|
||||
pip install opentelemetry-sdk==1.25.0
|
||||
pip install opentelemetry-exporter-otlp==1.25.0
|
||||
pip install openai==1.54.0
|
||||
pip install openai==1.66.1
|
||||
pip install prisma==0.11.0
|
||||
pip install "detect_secrets==1.5.0"
|
||||
pip install "httpx==0.24.1"
|
||||
|
@ -678,6 +678,48 @@ jobs:
|
|||
paths:
|
||||
- llm_translation_coverage.xml
|
||||
- llm_translation_coverage
|
||||
llm_responses_api_testing:
|
||||
docker:
|
||||
- image: cimg/python:3.11
|
||||
auth:
|
||||
username: ${DOCKERHUB_USERNAME}
|
||||
password: ${DOCKERHUB_PASSWORD}
|
||||
working_directory: ~/project
|
||||
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install Dependencies
|
||||
command: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r requirements.txt
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-retry==1.6.3"
|
||||
pip install "pytest-cov==5.0.0"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install "respx==0.21.1"
|
||||
# Run pytest and generate JUnit XML report
|
||||
- run:
|
||||
name: Run tests
|
||||
command: |
|
||||
pwd
|
||||
ls
|
||||
python -m pytest -vv tests/llm_responses_api_testing --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
|
||||
no_output_timeout: 120m
|
||||
- run:
|
||||
name: Rename the coverage files
|
||||
command: |
|
||||
mv coverage.xml llm_responses_api_coverage.xml
|
||||
mv .coverage llm_responses_api_coverage
|
||||
|
||||
# Store test results
|
||||
- store_test_results:
|
||||
path: test-results
|
||||
- persist_to_workspace:
|
||||
root: .
|
||||
paths:
|
||||
- llm_responses_api_coverage.xml
|
||||
- llm_responses_api_coverage
|
||||
litellm_mapped_tests:
|
||||
docker:
|
||||
- image: cimg/python:3.11
|
||||
|
@ -1234,7 +1276,7 @@ jobs:
|
|||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "PyGithub==1.59.1"
|
||||
pip install "openai==1.54.0 "
|
||||
pip install "openai==1.66.1"
|
||||
- run:
|
||||
name: Install Grype
|
||||
command: |
|
||||
|
@ -1309,7 +1351,7 @@ jobs:
|
|||
command: |
|
||||
pwd
|
||||
ls
|
||||
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
|
||||
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
|
||||
no_output_timeout: 120m
|
||||
|
||||
# Store test results
|
||||
|
@ -1370,7 +1412,7 @@ jobs:
|
|||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "PyGithub==1.59.1"
|
||||
pip install "openai==1.54.0 "
|
||||
pip install "openai==1.66.1"
|
||||
# Run pytest and generate JUnit XML report
|
||||
- run:
|
||||
name: Build Docker image
|
||||
|
@ -1492,7 +1534,7 @@ jobs:
|
|||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "PyGithub==1.59.1"
|
||||
pip install "openai==1.54.0 "
|
||||
pip install "openai==1.66.1"
|
||||
- run:
|
||||
name: Build Docker image
|
||||
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
|
||||
|
@ -1921,7 +1963,7 @@ jobs:
|
|||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install "google-cloud-aiplatform==1.43.0"
|
||||
pip install aiohttp
|
||||
pip install "openai==1.54.0 "
|
||||
pip install "openai==1.66.1"
|
||||
pip install "assemblyai==0.37.0"
|
||||
python -m pip install --upgrade pip
|
||||
pip install "pydantic==2.7.1"
|
||||
|
@ -1935,12 +1977,12 @@ jobs:
|
|||
pip install prisma
|
||||
pip install fastapi
|
||||
pip install jsonschema
|
||||
pip install "httpx==0.24.1"
|
||||
pip install "httpx==0.27.0"
|
||||
pip install "anyio==3.7.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "PyGithub==1.59.1"
|
||||
pip install "google-cloud-aiplatform==1.59.0"
|
||||
pip install "anthropic==0.21.3"
|
||||
pip install "anthropic==0.49.0"
|
||||
# Run pytest and generate JUnit XML report
|
||||
- run:
|
||||
name: Build Docker image
|
||||
|
@ -2068,7 +2110,7 @@ jobs:
|
|||
python -m venv venv
|
||||
. venv/bin/activate
|
||||
pip install coverage
|
||||
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
|
||||
coverage combine llm_translation_coverage llm_responses_api_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
|
||||
coverage xml
|
||||
- codecov/upload:
|
||||
file: ./coverage.xml
|
||||
|
@ -2197,7 +2239,7 @@ jobs:
|
|||
pip install "pytest-retry==1.6.3"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install aiohttp
|
||||
pip install "openai==1.54.0 "
|
||||
pip install "openai==1.66.1"
|
||||
python -m pip install --upgrade pip
|
||||
pip install "pydantic==2.7.1"
|
||||
pip install "pytest==7.3.1"
|
||||
|
@ -2429,6 +2471,12 @@ workflows:
|
|||
only:
|
||||
- main
|
||||
- /litellm_.*/
|
||||
- llm_responses_api_testing:
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
- /litellm_.*/
|
||||
- litellm_mapped_tests:
|
||||
filters:
|
||||
branches:
|
||||
|
@ -2468,6 +2516,7 @@ workflows:
|
|||
- upload-coverage:
|
||||
requires:
|
||||
- llm_translation_testing
|
||||
- llm_responses_api_testing
|
||||
- litellm_mapped_tests
|
||||
- batches_testing
|
||||
- litellm_utils_testing
|
||||
|
@ -2526,6 +2575,7 @@ workflows:
|
|||
- load_testing
|
||||
- test_bad_database_url
|
||||
- llm_translation_testing
|
||||
- llm_responses_api_testing
|
||||
- litellm_mapped_tests
|
||||
- batches_testing
|
||||
- litellm_utils_testing
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# used by CI/CD testing
|
||||
openai==1.54.0
|
||||
openai==1.66.1
|
||||
python-dotenv
|
||||
tiktoken
|
||||
importlib_metadata
|
||||
|
|
16
.github/pull_request_template.md
vendored
|
@ -6,6 +6,16 @@
|
|||
|
||||
<!-- e.g. "Fixes #000" -->
|
||||
|
||||
## Pre-Submission checklist
|
||||
|
||||
**Please complete all items before asking a LiteLLM maintainer to review your PR**
|
||||
|
||||
- [ ] I have added testing in the `tests/litellm/` directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code)
|
||||
- [ ] I have added a screenshot of my new test passing locally
|
||||
- [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code)
|
||||
- [ ] My PR's scope is as isolated as possible; it only solves 1 specific problem
|
||||
|
||||
|
||||
## Type
|
||||
|
||||
<!-- Select the type of Pull Request -->
|
||||
|
@ -20,10 +30,4 @@
|
|||
|
||||
## Changes
|
||||
|
||||
<!-- List of changes -->
|
||||
|
||||
## [REQUIRED] Testing - Attach a screenshot of any new tests passing locally
|
||||
If UI changes, send a screenshot/GIF of working UI fixes
|
||||
|
||||
<!-- Test procedure -->
|
||||
|
||||
|
|
27
.github/workflows/helm_unit_test.yml
vendored
Normal file
|
@ -0,0 +1,27 @@
|
|||
name: Helm unit test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
unit-test:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Set up Helm 3.11.1
|
||||
uses: azure/setup-helm@v1
|
||||
with:
|
||||
version: '3.11.1'
|
||||
|
||||
- name: Install Helm Unit Test Plugin
|
||||
run: |
|
||||
helm plugin install https://github.com/helm-unittest/helm-unittest --version v0.4.4
|
||||
|
||||
- name: Run unit tests
|
||||
run:
|
||||
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm
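The same two commands can be run locally before pushing a chart change (a sketch, assuming Helm 3 is installed; the plugin version mirrors the workflow above):

```bash
# one-time: install the unit-test plugin used by CI
helm plugin install https://github.com/helm-unittest/helm-unittest --version v0.4.4

# run the chart's unit tests from the repo root
helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm
```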
|
21
Makefile
Normal file
|
@ -0,0 +1,21 @@
|
|||
# LiteLLM Makefile
|
||||
# Simple Makefile for running tests and basic development tasks
|
||||
|
||||
.PHONY: help test test-unit test-integration
|
||||
|
||||
# Default target
|
||||
help:
|
||||
@echo "Available commands:"
|
||||
@echo " make test - Run all tests"
|
||||
@echo " make test-unit - Run unit tests"
|
||||
@echo " make test-integration - Run integration tests"
|
||||
|
||||
# Testing
|
||||
test:
|
||||
poetry run pytest tests/
|
||||
|
||||
test-unit:
|
||||
poetry run pytest tests/litellm/
|
||||
|
||||
test-integration:
|
||||
poetry run pytest tests/ -k "not litellm"
|
66
README.md
|
@ -340,71 +340,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
|
|||
|
||||
## Contributing
|
||||
|
||||
To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
|
||||
|
||||
Here's how to modify the repo locally:
|
||||
|
||||
Step 1: Clone the repo
|
||||
|
||||
```
|
||||
git clone https://github.com/BerriAI/litellm.git
|
||||
```
|
||||
|
||||
Step 2: Install dependencies:
|
||||
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Step 3: Test your change:
|
||||
|
||||
a. Add a pytest test within `tests/litellm/`
|
||||
|
||||
This folder follows the same directory structure as `litellm/`.
|
||||
|
||||
If a corresponding test file does not exist, create one.
|
||||
|
||||
b. Run the test
|
||||
|
||||
```
|
||||
cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
|
||||
pytest /path/to/test_file.py
|
||||
```
|
||||
|
||||
Step 4: Submit a PR with your changes! 🚀
|
||||
|
||||
- push your fork to your GitHub repo
|
||||
- submit a PR from there
|
||||
|
||||
### Building LiteLLM Docker Image
|
||||
|
||||
Follow these instructions if you want to build / run the LiteLLM Docker Image yourself.
|
||||
|
||||
Step 1: Clone the repo
|
||||
|
||||
```
|
||||
git clone https://github.com/BerriAI/litellm.git
|
||||
```
|
||||
|
||||
Step 2: Build the Docker Image
|
||||
|
||||
Build using Dockerfile.non_root
|
||||
```
|
||||
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
|
||||
```
|
||||
|
||||
Step 3: Run the Docker Image
|
||||
|
||||
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
|
||||
```
|
||||
docker run \
|
||||
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
|
||||
-e DATABASE_URL="postgresql://xxxxxxxx" \
|
||||
-e LITELLM_MASTER_KEY="sk-1234" \
|
||||
-p 4000:4000 \
|
||||
litellm_test_image \
|
||||
--config /app/config.yaml --detailed_debug
|
||||
```
|
||||
Interested in contributing? Contributions to the LiteLLM Python SDK, Proxy Server, and LLM integrations are all accepted and highly encouraged! [See our Contribution Guide for more details](https://docs.litellm.ai/docs/extras/contributing_code)
|
||||
|
||||
# Enterprise
|
||||
For companies that need better security, user management and professional support
|
||||
|
|
54
deploy/charts/litellm-helm/tests/deployment_tests.yaml
Normal file
|
@ -0,0 +1,54 @@
|
|||
suite: test deployment
|
||||
templates:
|
||||
- deployment.yaml
|
||||
- configmap-litellm.yaml
|
||||
tests:
|
||||
- it: should work
|
||||
template: deployment.yaml
|
||||
set:
|
||||
image.tag: test
|
||||
asserts:
|
||||
- isKind:
|
||||
of: Deployment
|
||||
- matchRegex:
|
||||
path: metadata.name
|
||||
pattern: -litellm$
|
||||
- equal:
|
||||
path: spec.template.spec.containers[0].image
|
||||
value: ghcr.io/berriai/litellm-database:test
|
||||
- it: should work with tolerations
|
||||
template: deployment.yaml
|
||||
set:
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
asserts:
|
||||
- equal:
|
||||
path: spec.template.spec.tolerations[0].key
|
||||
value: node-role.kubernetes.io/master
|
||||
- equal:
|
||||
path: spec.template.spec.tolerations[0].operator
|
||||
value: Exists
|
||||
- it: should work with affinity
|
||||
template: deployment.yaml
|
||||
set:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: topology.kubernetes.io/zone
|
||||
operator: In
|
||||
values:
|
||||
- antarctica-east1
|
||||
asserts:
|
||||
- equal:
|
||||
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key
|
||||
value: topology.kubernetes.io/zone
|
||||
- equal:
|
||||
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator
|
||||
value: In
|
||||
- equal:
|
||||
path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]
|
||||
value: antarctica-east1
|
|
@ -20,10 +20,18 @@ services:
|
|||
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
|
||||
env_file:
|
||||
- .env # Load local .env file
|
||||
depends_on:
|
||||
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
|
||||
healthcheck: # Defines the health check configuration for the container
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
|
||||
interval: 30s # Perform health check every 30 seconds
|
||||
timeout: 10s # Health check command times out after 10 seconds
|
||||
retries: 3 # Retry up to 3 times if health check fails
|
||||
start_period: 40s # Wait 40 seconds after container start before beginning health checks
|
||||
|
||||
|
||||
db:
|
||||
image: postgres
|
||||
image: postgres:16
|
||||
restart: always
|
||||
environment:
|
||||
POSTGRES_DB: litellm
|
||||
|
@ -31,6 +39,8 @@ services:
|
|||
POSTGRES_PASSWORD: dbpassword9090
|
||||
ports:
|
||||
- "5432:5432"
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
|
||||
interval: 1s
|
||||
|
@ -53,6 +63,8 @@ services:
|
|||
volumes:
|
||||
prometheus_data:
|
||||
driver: local
|
||||
postgres_data:
|
||||
name: litellm_postgres_data # Named volume for Postgres data persistence
|
||||
|
||||
|
||||
# ...rest of your docker-compose config if any
|
||||
|
|
92
docs/my-website/docs/anthropic_unified.md
Normal file
|
@ -0,0 +1,92 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# /v1/messages [BETA]
|
||||
|
||||
LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.
|
||||
|
||||
This currently just supports the Anthropic API.
|
||||
|
||||
| Feature | Supported | Notes |
|
||||
|-------|-------|-------|
|
||||
| Cost Tracking | ✅ | |
|
||||
| Logging | ✅ | works across all integrations |
|
||||
| End-user Tracking | ✅ | |
|
||||
| Streaming | ✅ | |
|
||||
| Fallbacks | ✅ | between anthropic models |
|
||||
| Loadbalancing | ✅ | between anthropic models |
|
||||
|
||||
Planned improvements:
|
||||
- Vertex AI Anthropic support
|
||||
- Bedrock Anthropic support
|
||||
|
||||
## Usage
|
||||
|
||||
<Tabs>
|
||||
<TabItem label="PROXY" value="proxy">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: anthropic-claude
|
||||
litellm_params:
|
||||
model: claude-3-7-sonnet-latest
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
|
||||
-H 'content-type: application/json' \
|
||||
-H "x-api-key: $LITELLM_API_KEY" \
|
||||
-H 'anthropic-version: 2023-06-01' \
|
||||
-d '{
|
||||
"model": "anthropic-claude",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "List 5 important events in the XIX century"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"max_tokens": 4096
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
# set env
|
||||
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
|
||||
|
||||
messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
|
||||
|
||||
# Call the handler
|
||||
async def call():
|
||||
response = await anthropic_messages(
|
||||
messages=messages,
|
||||
api_key=os.environ["ANTHROPIC_API_KEY"],
|
||||
model="claude-3-haiku-20240307",
|
||||
max_tokens=100,
|
||||
)
|
||||
|
||||
asyncio.run(call())
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Assistants API
|
||||
# /assistants
|
||||
|
||||
Covers Threads, Messages, Assistants.
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [BETA] Batches API
|
||||
# /batches
|
||||
|
||||
Covers Batches, Files
|
||||
|
||||
|
|
|
@ -189,4 +189,138 @@ Expected Response
|
|||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Explicitly specify image type
|
||||
|
||||
If you have images without a mime-type, or if litellm is incorrectly inferring the mime type of your image (e.g. calling `gs://` URLs with Vertex AI), you can set this explicitly via the `format` param.
|
||||
|
||||
```python
|
||||
"image_url": {
|
||||
"url": "gs://my-gs-image",
|
||||
"format": "image/jpeg"
|
||||
}
|
||||
```
|
||||
|
||||
LiteLLM will use this for any API endpoint that supports specifying a mime-type (e.g. Anthropic/Bedrock/Vertex AI).
|
||||
|
||||
For others (e.g. openai), it will be ignored.
|
||||
|
||||
<Tabs>
|
||||
<TabItem label="SDK" value="sdk">
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
|
||||
|
||||
# anthropic call
|
||||
response = completion(
|
||||
model = "claude-3-7-sonnet-latest",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What’s in this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"format": "image/jpeg"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem label="PROXY" value="proxy">
|
||||
|
||||
1. Define vision models on config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4-vision-preview # OpenAI gpt-4-vision-preview
|
||||
litellm_params:
|
||||
model: openai/gpt-4-vision-preview
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
- model_name: llava-hf # Custom OpenAI compatible model
|
||||
litellm_params:
|
||||
model: openai/llava-hf/llava-v1.6-vicuna-7b-hf
|
||||
api_base: http://localhost:8000
|
||||
api_key: fake-key
|
||||
model_info:
|
||||
supports_vision: True # set supports_vision to True so /model/info returns this attribute as True
|
||||
|
||||
```
|
||||
|
||||
2. Run proxy server
|
||||
|
||||
```bash
|
||||
litellm --config config.yaml
|
||||
```
|
||||
|
||||
3. Test it using the OpenAI Python SDK
|
||||
|
||||
|
||||
```python
|
||||
import os
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
api_key="sk-1234", # your litellm proxy api key
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model = "gpt-4-vision-preview", # use model="llava-hf" to test your custom OpenAI endpoint
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What’s in this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"format": "image/jpeg"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
## Spec
|
||||
|
||||
```
|
||||
"image_url": str
|
||||
|
||||
OR
|
||||
|
||||
"image_url": {
|
||||
"url": "url OR base64 encoded str",
|
||||
"detail": "openai-only param",
|
||||
"format": "specify mime-type of image"
|
||||
}
|
||||
```
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Embeddings
|
||||
# /embeddings
|
||||
|
||||
## Quick Start
|
||||
```python
|
||||
|
|
96
docs/my-website/docs/extras/contributing_code.md
Normal file
|
@ -0,0 +1,96 @@
|
|||
# Contributing Code
|
||||
|
||||
## **Checklist before submitting a PR**
|
||||
|
||||
Here are the core requirements for any PR submitted to LiteLLM
|
||||
|
||||
|
||||
- [ ] Add testing, **Adding at least 1 test is a hard requirement** - [see details](#2-adding-testing-to-your-pr)
|
||||
- [ ] Ensure your PR passes the following tests:
|
||||
- [ ] [Unit Tests](#3-running-unit-tests)
|
||||
- [ ] Formatting / Linting Tests
|
||||
- [ ] Keep scope as isolated as possible. As a general rule, your changes should address 1 specific problem at a time
|
||||
|
||||
|
||||
|
||||
## Quick start
|
||||
|
||||
## 1. Setup your local dev environment
|
||||
|
||||
|
||||
Here's how to modify the repo locally:
|
||||
|
||||
Step 1: Clone the repo
|
||||
|
||||
```shell
|
||||
git clone https://github.com/BerriAI/litellm.git
|
||||
```
|
||||
|
||||
Step 2: Install dev dependencies:
|
||||
|
||||
```shell
|
||||
poetry install --with dev --extras proxy
|
||||
```
|
||||
|
||||
That's it, your local dev environment is ready!
|
||||
|
||||
## 2. Adding Testing to your PR
|
||||
|
||||
- Add your test to the [`tests/litellm/` directory](https://github.com/BerriAI/litellm/tree/main/tests/litellm)
|
||||
|
||||
- This directory 1:1 maps to the `litellm/` directory, and can only contain mocked tests.
|
||||
- Do not add real LLM API calls to this directory.
|
||||
|
||||
### 2.1 File Naming Convention for `tests/litellm/`
|
||||
|
||||
The `tests/litellm/` directory follows the same directory structure as `litellm/`.
|
||||
|
||||
- `tests/litellm/proxy/test_caching_routes.py` maps to `litellm/proxy/caching_routes.py`
|
||||
- `test_{filename}.py` maps to `litellm/{filename}.py`
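For illustration, a minimal mocked test following this convention could look like the sketch below (the file name and the helper under test are illustrative, not part of this PR):

```python
# tests/litellm/test_token_counting.py  (hypothetical file, mirrors the litellm/ layout)
import litellm


def test_token_counter_counts_user_message():
    messages = [{"role": "user", "content": "hello"}]
    # token_counter runs locally - no real LLM API call is made
    assert litellm.token_counter(model="gpt-3.5-turbo", messages=messages) > 0
```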
|
||||
|
||||
## 3. Running Unit Tests
|
||||
|
||||
Run the following command from the root of the litellm directory:
|
||||
|
||||
```shell
|
||||
make test-unit
|
||||
```
|
||||
|
||||
## 4. Submit a PR with your changes!
|
||||
|
||||
- push your fork to your GitHub repo
|
||||
- submit a PR from there
|
||||
|
||||
|
||||
## Advanced
|
||||
### Building LiteLLM Docker Image
|
||||
|
||||
Some users may want to build / run the LiteLLM Docker image themselves. Follow these instructions to do so.
|
||||
|
||||
Step 1: Clone the repo
|
||||
|
||||
```shell
|
||||
git clone https://github.com/BerriAI/litellm.git
|
||||
```
|
||||
|
||||
Step 2: Build the Docker Image
|
||||
|
||||
Build using Dockerfile.non_root
|
||||
|
||||
```shell
|
||||
docker build -f docker/Dockerfile.non_root -t litellm_test_image .
|
||||
```
|
||||
|
||||
Step 3: Run the Docker Image
|
||||
|
||||
Make sure config.yaml is present in the root directory. This is your litellm proxy config file.
|
||||
|
||||
```shell
|
||||
docker run \
|
||||
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
|
||||
-e DATABASE_URL="postgresql://xxxxxxxx" \
|
||||
-e LITELLM_MASTER_KEY="sk-1234" \
|
||||
-p 4000:4000 \
|
||||
litellm_test_image \
|
||||
--config /app/config.yaml --detailed_debug
|
||||
```
|
|
@ -2,7 +2,7 @@
|
|||
import TabItem from '@theme/TabItem';
|
||||
import Tabs from '@theme/Tabs';
|
||||
|
||||
# Files API
|
||||
# /files
|
||||
|
||||
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [Beta] Fine-tuning API
|
||||
# /fine_tuning
|
||||
|
||||
|
||||
:::info
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Moderation
|
||||
# /moderations
|
||||
|
||||
|
||||
### Usage
|
||||
|
|
|
@ -78,6 +78,9 @@ Following are the allowed fields in metadata, their types, and their description
|
|||
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
|
||||
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
|
||||
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
|
||||
* `tags: Optional[list]` - This is a list of tags. This is useful for segmenting inference calls by tags.
|
||||
* `user_feedback: Optional[str]` - The end user’s feedback.
|
||||
* `model_options: Optional[dict]` - This is a dictionary of model options. This is useful for getting insights into how model behavior affects your end users.
|
||||
* `custom_attributes: Optional[dict]` - This is a dictionary of custom attributes. This is useful for additional information about the inference.
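As a rough sketch of how these fields are attached to a request (assuming the Athina logger is enabled as a LiteLLM success callback and `ATHINA_API_KEY` is set; the field values are placeholders):

```python
import litellm

litellm.success_callback = ["athina"]  # send successful calls to Athina

response = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    metadata={
        "user_query": "What is the capital of France?",
        "context": "France is a country in Western Europe. Its capital is Paris.",
        "expected_response": "Paris",
        "custom_attributes": {"experiment": "geo-qa-v1"},
    },
)
```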
|
||||
|
||||
## Using a self hosted deployment of Athina
|
||||
|
|
5
docs/my-website/docs/projects/PDL.md
Normal file
|
@ -0,0 +1,5 @@
|
|||
# PDL - A YAML-based approach to prompt programming
|
||||
|
||||
Github: https://github.com/IBM/prompt-declaration-language
|
||||
|
||||
PDL is a declarative approach to prompt programming, helping users to accumulate messages implicitly, with support for model chaining and tool use.
|
9
docs/my-website/docs/projects/pgai.md
Normal file
|
@ -0,0 +1,9 @@
|
|||
# pgai
|
||||
|
||||
[pgai](https://github.com/timescale/pgai) is a suite of tools to develop RAG, semantic search, and other AI applications more easily with PostgreSQL.
|
||||
|
||||
If you don't know what pgai is yet check out the [README](https://github.com/timescale/pgai)!
|
||||
|
||||
If you're already familiar with pgai, you can find litellm-specific docs here:
|
||||
- Litellm for [model calling](https://github.com/timescale/pgai/blob/main/docs/model_calling/litellm.md) in pgai
|
||||
- Use the [litellm provider](https://github.com/timescale/pgai/blob/main/docs/vectorizer/api-reference.md#aiembedding_litellm) to automatically create embeddings for your data via the pgai vectorizer.
|
|
@ -63,9 +63,9 @@ model_list:
|
|||
- model_name: bedrock-claude-v1
|
||||
litellm_params:
|
||||
model: bedrock/anthropic.claude-instant-v1
|
||||
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
|
||||
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
|
||||
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
|
||||
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
|
||||
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
|
||||
aws_region_name: os.environ/AWS_REGION_NAME
|
||||
```
|
||||
|
||||
All possible auth params:
|
||||
|
@ -286,9 +286,12 @@ print(response)
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Usage - Function Calling
|
||||
## Usage - Function Calling / Tool calling
|
||||
|
||||
LiteLLM uses Bedrock's Converse API for making tool calls
|
||||
LiteLLM supports tool calling via Bedrock's Converse and Invoke API's.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
@ -333,6 +336,69 @@ assert isinstance(
|
|||
response.choices[0].message.tool_calls[0].function.arguments, str
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: bedrock-claude-3-7
|
||||
litellm_params:
|
||||
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # for bedrock invoke, specify `bedrock/invoke/<model>`
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_API_KEY" \
|
||||
-d '{
|
||||
"model": "bedrock-claude-3-7",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What'\''s the weather like in Boston today?"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
},
|
||||
"required": ["location"]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_choice": "auto"
|
||||
}'
|
||||
|
||||
```
|
||||
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Usage - Vision
|
||||
|
@ -390,9 +456,9 @@ Returns 2 new fields in `message` and `delta` object:
|
|||
Each object has the following fields:
|
||||
- `type` - Literal["thinking"] - The type of thinking block
|
||||
- `thinking` - string - The thinking of the response. Also returned in `reasoning_content`
|
||||
- `signature_delta` - string - A base64 encoded string, returned by Anthropic.
|
||||
- `signature` - string - A base64 encoded string, returned by Anthropic.
|
||||
|
||||
The `signature_delta` is required by Anthropic on subsequent calls, if 'thinking' content is passed in (only required to use `thinking` with tool calling). [Learn more](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks)
|
||||
The `signature` is required by Anthropic on subsequent calls, if 'thinking' content is passed in (only required to use `thinking` with tool calling). [Learn more](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks)
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
@ -475,7 +541,7 @@ Same as [Anthropic API response](../providers/anthropic#usage---thinking--reason
|
|||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The capital of France is Paris. This is a straightforward factual question.",
|
||||
"signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+yCHpBY7U6FQW8/FcoLewocJQPa2HnmLM+NECy50y44F/kD4SULFXi57buI9fAvyBwtyjlOiO0SDE3+r3spdg6PLOo9PBoMma2ku5OTAoR46j9VIjDRlvNmBvff7YW4WI9oU8XagaOBSxLPxElrhyuxppEn7m6bfT40dqBSTDrfiw4FYB4qEPETTI6TA6wtjGAAqmFqKTo="
|
||||
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+yCHpBY7U6FQW8/FcoLewocJQPa2HnmLM+NECy50y44F/kD4SULFXi57buI9fAvyBwtyjlOiO0SDE3+r3spdg6PLOo9PBoMma2ku5OTAoR46j9VIjDRlvNmBvff7YW4WI9oU8XagaOBSxLPxElrhyuxppEn7m6bfT40dqBSTDrfiw4FYB4qEPETTI6TA6wtjGAAqmFqKTo="
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -492,6 +558,111 @@ Same as [Anthropic API response](../providers/anthropic#usage---thinking--reason
|
|||
```
|
||||
|
||||
|
||||
## Usage - Structured Output / JSON mode
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
from pydantic import BaseModel
|
||||
|
||||
# set env
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
class CalendarEvent(BaseModel):
|
||||
name: str
|
||||
date: str
|
||||
participants: list[str]
|
||||
|
||||
class EventsList(BaseModel):
|
||||
events: list[CalendarEvent]
|
||||
|
||||
response = completion(
|
||||
model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0", # specify invoke via `bedrock/invoke/anthropic.claude-3-7-sonnet-20250219-v1:0`
|
||||
response_format=EventsList,
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
|
||||
{"role": "user", "content": "Who won the world series in 2020?"}
|
||||
],
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: bedrock-claude-3-7
|
||||
litellm_params:
|
||||
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 # specify invoke via `bedrock/invoke/<model_name>`
|
||||
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
|
||||
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
|
||||
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl http://0.0.0.0:4000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $LITELLM_KEY" \
|
||||
-d '{
|
||||
"model": "bedrock-claude-3-7",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant designed to output JSON."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Who won the worlde series in 2020?"
|
||||
}
|
||||
],
|
||||
"response_format": {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "math_reasoning",
|
||||
"description": "reason about maths",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"steps": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"explanation": { "type": "string" },
|
||||
"output": { "type": "string" }
|
||||
},
|
||||
"required": ["explanation", "output"],
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"final_answer": { "type": "string" }
|
||||
},
|
||||
"required": ["steps", "final_answer"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"strict": true
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Usage - Bedrock Guardrails
|
||||
|
||||
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
|
||||
|
@ -1621,10 +1792,14 @@ print(response)
|
|||
### Advanced - [Pass model/provider-specific Params](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)
|
||||
|
||||
## Image Generation
|
||||
Use this for stable diffusion on bedrock
|
||||
Use this for Stable Diffusion and Amazon Nova Canvas on Bedrock
|
||||
|
||||
|
||||
### Usage
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import image_generation
|
||||
|
@ -1659,6 +1834,41 @@ response = image_generation(
|
|||
)
|
||||
print(f"response: {response}")
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: amazon.nova-canvas-v1:0
|
||||
litellm_params:
|
||||
model: bedrock/amazon.nova-canvas-v1:0
|
||||
aws_region_name: "us-east-1"
|
||||
aws_secret_access_key: my-key # OPTIONAL - all boto3 auth params supported
|
||||
aws_access_key_id: my-id # OPTIONAL - all boto3 auth params supported
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/images/generations' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H "Authorization: Bearer $LITELLM_VIRTUAL_KEY" \
|
||||
-d '{
|
||||
"model": "amazon.nova-canvas-v1:0",
|
||||
"prompt": "A cute baby sea otter"
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Supported AWS Bedrock Image Generation Models
|
||||
|
||||
|
@ -1739,6 +1949,8 @@ curl http://0.0.0.0:4000/rerank \
|
|||
"Capital punishment has existed in the United States since before it was a country."
|
||||
],
|
||||
"top_n": 3
|
||||
|
||||
|
||||
}'
|
||||
```
|
||||
|
||||
|
|
|
@ -404,14 +404,16 @@ curl http://localhost:4000/v1/chat/completions \
|
|||
If this was your initial VertexAI Grounding code,
|
||||
|
||||
```python
|
||||
import vertexai
|
||||
import vertexai
|
||||
from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, grounding
|
||||
|
||||
|
||||
vertexai.init(project=project_id, location="us-central1")
|
||||
|
||||
model = GenerativeModel("gemini-1.5-flash-001")
|
||||
|
||||
# Use Google Search for grounding
|
||||
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attributon=False))
|
||||
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval())
|
||||
|
||||
prompt = "When is the next total solar eclipse in US?"
|
||||
response = model.generate_content(
|
||||
|
@ -428,7 +430,7 @@ print(response)
|
|||
then, this is what it looks like now
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
from litellm import completion
|
||||
|
||||
|
||||
# !gcloud auth application-default login - run this to add vertex credentials to your env
|
||||
|
@ -1686,6 +1688,14 @@ assert isinstance(
|
|||
|
||||
Pass any file supported by Vertex AI, through LiteLLM.
|
||||
|
||||
LiteLLM supports the following file types:
|
||||
|
||||
```
|
||||
Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg
|
||||
Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
|
||||
Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4
|
||||
Base64 Encoded Local Images
|
||||
```
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
|
|
@ -46,18 +46,17 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
|
|||
|
||||
| Table Name | Description | Row Insert Frequency |
|
||||
|------------|-------------|---------------------|
|
||||
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** |
|
||||
| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
|
||||
| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** |
|
||||
| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |
|
||||
|
||||
## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
|
||||
## Disable `LiteLLM_SpendLogs`
|
||||
|
||||
You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
disable_spend_logs: True # Disable writing spend logs to DB
|
||||
disable_error_logs: True # Disable writing error logs to DB
|
||||
disable_error_logs: True # Only disable writing error logs to DB, regular spend logs will still be written unless `disable_spend_logs: True`
|
||||
```
|
||||
|
||||
### What is the impact of disabling these logs?
|
||||
|
|
|
@ -78,6 +78,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds:
|
|||
| `api_base` | `Optional[str]` | Optional API base URL |
|
||||
| `response_cost` | `Optional[str]` | Optional response cost |
|
||||
| `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers |
|
||||
| `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation |
|
||||
|
||||
## StandardLoggingModelInformation
|
||||
|
||||
|
|
53
docs/my-website/docs/proxy/master_key_rotations.md
Normal file
|
@ -0,0 +1,53 @@
|
|||
# Rotating Master Key
|
||||
|
||||
Here are our recommended steps for rotating your master key.
|
||||
|
||||
|
||||
**1. Backup your DB**
|
||||
If anything goes wrong during the re-encryption process, a backup lets you revert to the current state without issues.
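For example, with a Postgres database, a minimal backup sketch (assuming `DATABASE_URL` points at your LiteLLM database):

```bash
pg_dump "$DATABASE_URL" > litellm_backup_$(date +%F).sql
```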
|
||||
|
||||
**2. Call `/key/regenerate` with the new master key**
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://localhost:4000/key/regenerate' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"key": "sk-1234",
|
||||
"new_master_key": "sk-PIp1h0RekR"
|
||||
}'
|
||||
```
|
||||
|
||||
This will re-encrypt any models in your Proxy_ModelTable with the new master key.
|
||||
|
||||
Expect to start seeing decryption errors in logs, as your old master key is no longer able to decrypt the new values.
|
||||
|
||||
```bash
|
||||
raise Exception("Unable to decrypt value={}".format(v))
|
||||
Exception: Unable to decrypt value=<new-encrypted-value>
|
||||
```
|
||||
|
||||
**3. Update LITELLM_MASTER_KEY**
|
||||
|
||||
In your environment variables, update the value of `LITELLM_MASTER_KEY` to the `new_master_key` from Step 2.
|
||||
|
||||
This ensures the key used to decrypt values from the DB is the new key.
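For example, if the proxy reads its environment from the shell (reusing the `new_master_key` from Step 2):

```bash
export LITELLM_MASTER_KEY="sk-PIp1h0RekR"
```

Then restart the proxy so the new value is picked up.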
|
||||
|
||||
**4. Test it**
|
||||
|
||||
Make a test request to a model stored on the proxy with a LiteLLM key (new master key or virtual key) and confirm it works. Replace `gpt-4o-mini` below with the 'public model name' of any db-model.
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini", # 👈 REPLACE with 'public model name' for any db-model
|
||||
"messages": [
|
||||
{
|
||||
"content": "Hey, how's it going",
|
||||
"role": "user"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
|
@ -107,9 +107,9 @@ general_settings:
|
|||
|
||||
By default, LiteLLM writes several types of logs to the database:
|
||||
- Every LLM API request to the `LiteLLM_SpendLogs` table
|
||||
- LLM Exceptions to the `LiteLLM_LogsErrors` table
|
||||
- LLM Exceptions to the `LiteLLM_SpendLogs` table
|
||||
|
||||
If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
|
||||
If you're not viewing these logs on the LiteLLM UI, you can disable them by setting the following flags to `True`:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
|
|
|
@ -4,7 +4,7 @@ Litellm Proxy has the following release cycle:
|
|||
|
||||
- `v1.x.x-nightly`: These are releases which pass ci/cd.
|
||||
- `v1.x.x.rc`: These are releases which pass ci/cd + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711).
|
||||
- `v1.x.x`: These are releases which pass ci/cd + manual review + 3 days of production testing.
|
||||
- `v1.x.x` OR `v1.x.x-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing.
|
||||
|
||||
In production, we recommend using the latest `v1.x.x` release.
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Realtime Endpoints
|
||||
# /realtime
|
||||
|
||||
Use this to load balance across Azure + OpenAI.
|
||||
|
||||
|
|
|
@ -3,6 +3,12 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
# 'Thinking' / 'Reasoning Content'
|
||||
|
||||
:::info
|
||||
|
||||
Requires LiteLLM v1.63.0+
|
||||
|
||||
:::
|
||||
|
||||
Supported Providers:
|
||||
- Deepseek (`deepseek/`)
|
||||
- Anthropic API (`anthropic/`)
|
||||
|
@ -17,7 +23,7 @@ Supported Providers:
|
|||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The capital of France is Paris.",
|
||||
"signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
|
||||
"signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -292,7 +298,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
|||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The user is asking for the current weather in three different locations: San Francisco, Tokyo, and Paris. I have access to the `get_current_weather` function that can provide this information.\n\nThe function requires a `location` parameter, and has an optional `unit` parameter. The user hasn't specified which unit they prefer (celsius or fahrenheit), so I'll use the default provided by the function.\n\nI need to make three separate function calls, one for each location:\n1. San Francisco\n2. Tokyo\n3. Paris\n\nThen I'll compile the results into a response with three distinct weather reports as requested by the user.",
|
||||
"signature_delta": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
|
||||
"signature": "EqoBCkgIARABGAIiQCkBXENoyB+HstUOs/iGjG+bvDbIQRrxPsPpOSt5yDxX6iulZ/4K/w9Rt4J5Nb2+3XUYsyOH+CpZMfADYvItFR4SDPb7CmzoGKoolCMAJRoM62p1ZRASZhrD3swqIjAVY7vOAFWKZyPEJglfX/60+bJphN9W1wXR6rWrqn3MwUbQ5Mb/pnpeb10HMploRgUqEGKOd6fRKTkUoNDuAnPb55c="
|
||||
}
|
||||
],
|
||||
"provider_specific_fields": {
|
||||
|
@ -353,5 +359,5 @@ These fields can be accessed via `response.choices[0].message.reasoning_content`
|
|||
- `thinking_blocks` - Optional[List[Dict[str, str]]]: A list of thinking blocks from the model. Only returned for Anthropic models.
|
||||
- `type` - str: The type of thinking block.
|
||||
- `thinking` - str: The thinking from the model.
|
||||
- `signature_delta` - str: The signature delta from the model.
|
||||
- `signature` - str: The signature from the model.
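A minimal sketch of reading these fields with the LiteLLM SDK (assumes an Anthropic key and a model that returns thinking blocks; the model name here is an example):

```python
import os
import litellm

os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."  # assumption: your Anthropic key

response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

message = response.choices[0].message
print(message.reasoning_content)            # plain-text reasoning
for block in message.thinking_blocks or []:
    print(block["type"], block["signature"][:20])  # 'thinking' + base64 signature
```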
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Rerank
|
||||
# /rerank
|
||||
|
||||
:::tip
|
||||
|
||||
|
|
117
docs/my-website/docs/response_api.md
Normal file
|
@ -0,0 +1,117 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# /responses [Beta]
|
||||
|
||||
LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](https://platform.openai.com/docs/api-reference/responses)
|
||||
|
||||
| Feature | Supported | Notes |
|
||||
|---------|-----------|--------|
|
||||
| Cost Tracking | ✅ | Works with all supported models |
|
||||
| Logging | ✅ | Works across all integrations |
|
||||
| End-user Tracking | ✅ | |
|
||||
| Streaming | ✅ | |
|
||||
| Fallbacks | ✅ | Works between supported models |
|
||||
| Loadbalancing | ✅ | Works between supported models |
|
||||
| Supported LiteLLM Versions | 1.63.8+ | |
|
||||
| Supported LLM providers | `openai` | |
|
||||
|
||||
## Usage
|
||||
|
||||
## Create a model response
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="litellm-sdk" label="LiteLLM SDK">
|
||||
|
||||
#### Non-streaming
|
||||
```python
|
||||
import litellm
|
||||
|
||||
# Non-streaming response
|
||||
response = litellm.responses(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
max_output_tokens=100
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python
|
||||
import litellm
|
||||
|
||||
# Streaming response
|
||||
response = litellm.responses(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
|
||||
|
||||
First, add this to your litellm proxy config.yaml:
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-4o
|
||||
litellm_params:
|
||||
model: openai/gpt-4o
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
Start your LiteLLM proxy:
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
Then use the OpenAI SDK pointed to your proxy:
|
||||
|
||||
#### Non-streaming
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Non-streaming response
|
||||
response = client.responses.create(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn."
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
#### Streaming
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
# Initialize client with your proxy URL
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:4000", # Your proxy URL
|
||||
api_key="your-api-key" # Your proxy API key
|
||||
)
|
||||
|
||||
# Streaming response
|
||||
response = client.responses.create(
|
||||
model="gpt-4o",
|
||||
input="Tell me a three sentence bedtime story about a unicorn.",
|
||||
stream=True
|
||||
)
|
||||
|
||||
for event in response:
|
||||
print(event)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
|
@ -952,8 +952,8 @@ router_settings:
|
|||
```
|
||||
|
||||
Defaults:
|
||||
- allowed_fails: 0
|
||||
- cooldown_time: 60s
|
||||
- allowed_fails: 3
|
||||
- cooldown_time: 5s (`DEFAULT_COOLDOWN_TIME_SECONDS` in constants.py)
|
||||
|
||||
**Set Per Model**
|
||||
|
||||
|
|
|
@ -96,6 +96,33 @@ litellm --config /path/to/config.yaml
|
|||
```
|
||||
|
||||
|
||||
### Using K/V pairs in 1 AWS Secret
|
||||
|
||||
You can read multiple keys from a single AWS Secret using the `primary_secret_name` parameter:
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
key_management_system: "aws_secret_manager"
|
||||
key_management_settings:
|
||||
hosted_keys: [
|
||||
"OPENAI_API_KEY_MODEL_1",
|
||||
"OPENAI_API_KEY_MODEL_2",
|
||||
]
|
||||
primary_secret_name: "litellm_secrets" # 👈 Read multiple keys from one JSON secret
|
||||
```
|
||||
|
||||
The `primary_secret_name` allows you to read multiple keys from a single AWS Secret as a JSON object. For example, the "litellm_secrets" would contain:
|
||||
|
||||
```json
|
||||
{
|
||||
"OPENAI_API_KEY_MODEL_1": "sk-key1...",
|
||||
"OPENAI_API_KEY_MODEL_2": "sk-key2..."
|
||||
}
|
||||
```
|
||||
|
||||
This reduces the number of AWS Secrets you need to manage.
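A sketch of creating such a combined secret with the AWS CLI (the secret name and values are placeholders):

```bash
aws secretsmanager create-secret \
  --name litellm_secrets \
  --secret-string '{"OPENAI_API_KEY_MODEL_1": "sk-key1...", "OPENAI_API_KEY_MODEL_2": "sk-key2..."}'
```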
|
||||
|
||||
|
||||
## Hashicorp Vault
|
||||
|
||||
|
||||
|
@ -353,4 +380,7 @@ general_settings:
|
|||
|
||||
# Hosted Keys Settings
|
||||
hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS
|
||||
|
||||
# K/V pairs in 1 AWS Secret Settings
|
||||
primary_secret_name: "litellm_secrets" # OPTIONAL. Read multiple keys from one JSON secret on AWS Secret Manager
|
||||
```
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Text Completion
|
||||
# /completions
|
||||
|
||||
### Usage
|
||||
<Tabs>
|
||||
|
|
|
@ -2,9 +2,9 @@ import Image from '@theme/IdealImage';
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Use LiteLLM AI Gateway with Aporia Guardrails
|
||||
# Aporia Guardrails with LiteLLM Gateway
|
||||
|
||||
In this tutorial we will use LiteLLM Proxy with Aporia to detect PII in requests and profanity in responses
|
||||
In this tutorial we will use LiteLLM AI Gateway with Aporia to detect PII in requests and profanity in responses
|
||||
|
||||
## 1. Setup guardrails on Aporia
|
||||
|
||||
|
|
103
docs/my-website/docs/tutorials/openweb_ui.md
Normal file
|
@ -0,0 +1,103 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# OpenWeb UI with LiteLLM
|
||||
|
||||
This guide walks you through connecting OpenWeb UI to LiteLLM. Using LiteLLM with OpenWeb UI allows teams to
|
||||
- Access 100+ LLMs on OpenWeb UI
|
||||
- Track Spend / Usage, Set Budget Limits
|
||||
- Send Request/Response Logs to logging destinations like langfuse, s3, gcs buckets, etc.
|
||||
- Set access controls, e.g. control what models OpenWebUI can access.
|
||||
|
||||
## Quickstart
|
||||
|
||||
- Make sure to setup LiteLLM with the [LiteLLM Getting Started Guide](https://docs.litellm.ai/docs/proxy/docker_quick_start)
|
||||
|
||||
|
||||
## 1. Start LiteLLM & OpenWebUI
|
||||
|
||||
- OpenWebUI starts running on [http://localhost:3000](http://localhost:3000)
|
||||
- LiteLLM starts running on [http://localhost:4000](http://localhost:4000)
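If you don't have both running yet, a rough sketch using Docker (the OpenWebUI image and port mapping are the upstream defaults and are assumptions, not part of this guide; see the Getting Started guide above for the full LiteLLM setup):

```bash
# LiteLLM proxy on :4000
docker run -d -p 4000:4000 \
  -e OPENAI_API_KEY="sk-..." \
  -v $(pwd)/config.yaml:/app/config.yaml \
  ghcr.io/berriai/litellm:main-latest --config /app/config.yaml

# OpenWebUI on :3000 (upstream default image - assumption)
docker run -d -p 3000:8080 ghcr.io/open-webui/open-webui:main
```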
|
||||
|
||||
|
||||
## 2. Create a Virtual Key on LiteLLM
|
||||
|
||||
Virtual Keys are API Keys that allow you to authenticate to LiteLLM Proxy. We will create a Virtual Key that will allow OpenWebUI to access LiteLLM.
|
||||
|
||||
### 2.1 LiteLLM User Management Hierarchy
|
||||
|
||||
On LiteLLM, you can create Organizations, Teams, Users and Virtual Keys. For this tutorial, we will create a Team and a Virtual Key.
|
||||
|
||||
- `Organization` - An Organization is a group of Teams. (US Engineering, EU Developer Tools)
|
||||
- `Team` - A Team is a group of Users. (OpenWeb UI Team, Data Science Team, etc.)
|
||||
- `User` - A User is an individual user (employee, developer, e.g. `krrish@litellm.ai`)
|
||||
- `Virtual Key` - A Virtual Key is an API Key that allows you to authenticate to LiteLLM Proxy. A Virtual Key is associated with a User or Team.
|
||||
|
||||
Once the Team is created, you can invite Users to the Team. You can read more about LiteLLM's User Management [here](https://docs.litellm.ai/docs/proxy/user_management_heirarchy).
|
||||
|
||||
### 2.2 Create a Team on LiteLLM
|
||||
|
||||
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new team.
|
||||
|
||||
<Image img={require('../../img/litellm_create_team.gif')} />
|
||||
|
||||
### 2.3 Create a Virtual Key on LiteLLM
|
||||
|
||||
Navigate to [http://localhost:4000/ui](http://localhost:4000/ui) and create a new virtual Key.
|
||||
|
||||
LiteLLM allows you to specify what models are available on OpenWeb UI (by specifying the models the key will have access to).
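The same key can also be created via the API with `/key/generate` (a sketch; the model names and team id are placeholders for whatever you configured):

```bash
curl -X POST 'http://localhost:4000/key/generate' \
  -H 'Authorization: Bearer sk-1234' \
  -H 'Content-Type: application/json' \
  -d '{
    "models": ["gpt-4o", "thinking-anthropic-claude-3-7-sonnet"],
    "team_id": "openwebui-team"
  }'
```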
|
||||
|
||||
<Image img={require('../../img/create_key_in_team_oweb.gif')} />
|
||||
|
||||
## 3. Connect OpenWeb UI to LiteLLM
|
||||
|
||||
On OpenWeb UI, navigate to Settings -> Connections and create a new connection to LiteLLM
|
||||
|
||||
Enter the following details:
|
||||
- URL: `http://localhost:4000` (your litellm proxy base url)
|
||||
- Key: `your-virtual-key` (the key you created in the previous step)
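Before saving, you can sanity-check the same URL and key with curl; the models the key can access should be listed (assuming the proxy runs on localhost:4000):

```bash
curl http://localhost:4000/v1/models \
  -H "Authorization: Bearer your-virtual-key"
```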
|
||||
|
||||
<Image img={require('../../img/litellm_setup_openweb.gif')} />
|
||||
|
||||
### 3.1 Test Request
|
||||
|
||||
In the top left corner, open the model selector. You should only see the models you gave the key access to in Step 2.
|
||||
|
||||
Once you have selected a model, enter your message content and click `Submit`.
|
||||
|
||||
<Image img={require('../../img/basic_litellm.gif')} />
|
||||
|
||||
### 3.2 Tracking Spend / Usage
|
||||
|
||||
After your request is made, navigate to `Logs` on the LiteLLM UI. You can see the Team, Key, Model, Usage and Cost.
|
||||
|
||||
<!-- <Image img={require('../../img/litellm_logs_openweb.gif')} /> -->
|
||||
|
||||
|
||||
|
||||
## Render `thinking` content on OpenWeb UI
|
||||
|
||||
OpenWebUI requires reasoning/thinking content to be wrapped in `<think></think>` tags. To render this for specific models, you can use the `merge_reasoning_content_in_choices` litellm parameter.
|
||||
|
||||
Example litellm config.yaml:
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: thinking-anthropic-claude-3-7-sonnet
|
||||
litellm_params:
|
||||
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
|
||||
thinking: {"type": "enabled", "budget_tokens": 1024}
|
||||
max_tokens: 1080
|
||||
merge_reasoning_content_in_choices: true
|
||||
```
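
Before testing in OpenWeb UI, you can sanity-check the merged output directly against the proxy. A rough sketch - the virtual key and prompt are placeholders:

```bash
curl -s http://localhost:4000/v1/chat/completions \
  -H "Authorization: Bearer sk-your-virtual-key" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "thinking-anthropic-claude-3-7-sonnet",
    "messages": [{"role": "user", "content": "What is the capital of France?"}]
  }'
# With merge_reasoning_content_in_choices: true, the assistant message "content"
# should begin with the reasoning wrapped in <think>...</think> tags.
```
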
|
||||
|
||||
### Test it on OpenWeb UI
|
||||
|
||||
In the models dropdown, select `thinking-anthropic-claude-3-7-sonnet`.
|
||||
|
||||
<Image img={require('../../img/litellm_thinking_openweb.gif')} />
|
||||
|
||||
|
||||
|
||||
|
|
@ -44,7 +44,7 @@ const config = {
|
|||
path: './release_notes',
|
||||
routeBasePath: 'release_notes',
|
||||
blogTitle: 'Release Notes',
|
||||
blogSidebarTitle: 'All Releases',
|
||||
blogSidebarTitle: 'Releases',
|
||||
blogSidebarCount: 'ALL',
|
||||
postsPerPage: 'ALL',
|
||||
showReadingTime: false,
|
||||
|
|
BIN
docs/my-website/img/basic_litellm.gif
Normal file
After Width: | Height: | Size: 2.6 MiB |
BIN
docs/my-website/img/create_key_in_team_oweb.gif
Normal file
After Width: | Height: | Size: 13 MiB |
BIN
docs/my-website/img/litellm_create_team.gif
Normal file
After Width: | Height: | Size: 5.4 MiB |
BIN
docs/my-website/img/litellm_setup_openweb.gif
Normal file
After Width: | Height: | Size: 2.7 MiB |
BIN
docs/my-website/img/litellm_thinking_openweb.gif
Normal file
After Width: | Height: | Size: 5.1 MiB |
BIN
docs/my-website/img/release_notes/anthropic_thinking.jpg
Normal file
After Width: | Height: | Size: 470 KiB |
BIN
docs/my-website/img/release_notes/error_logs.jpg
Normal file
After Width: | Height: | Size: 918 KiB |
BIN
docs/my-website/img/release_notes/v1632_release.jpg
Normal file
After Width: | Height: | Size: 386 KiB |
47
docs/my-website/package-lock.json
generated
|
@ -706,12 +706,13 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/helpers": {
|
||||
"version": "7.26.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.0.tgz",
|
||||
"integrity": "sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.10.tgz",
|
||||
"integrity": "sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/template": "^7.25.9",
|
||||
"@babel/types": "^7.26.0"
|
||||
"@babel/template": "^7.26.9",
|
||||
"@babel/types": "^7.26.10"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
|
@ -796,11 +797,12 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/parser": {
|
||||
"version": "7.26.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.3.tgz",
|
||||
"integrity": "sha512-WJ/CvmY8Mea8iDXo6a7RK2wbmJITT5fN3BEkRuFlxVyNx8jOKIIhmC4fSkTcPcf8JyavbBwIe6OpiCOBXt/IcA==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.10.tgz",
|
||||
"integrity": "sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/types": "^7.26.3"
|
||||
"@babel/types": "^7.26.10"
|
||||
},
|
||||
"bin": {
|
||||
"parser": "bin/babel-parser.js"
|
||||
|
@ -2157,9 +2159,10 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/runtime-corejs3": {
|
||||
"version": "7.26.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.0.tgz",
|
||||
"integrity": "sha512-YXHu5lN8kJCb1LOb9PgV6pvak43X2h4HvRApcN5SdWeaItQOzfn1hgP6jasD6KWQyJDBxrVmA9o9OivlnNJK/w==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz",
|
||||
"integrity": "sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"core-js-pure": "^3.30.2",
|
||||
"regenerator-runtime": "^0.14.0"
|
||||
|
@ -2169,13 +2172,14 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/template": {
|
||||
"version": "7.25.9",
|
||||
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz",
|
||||
"integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==",
|
||||
"version": "7.26.9",
|
||||
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.26.9.tgz",
|
||||
"integrity": "sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/code-frame": "^7.25.9",
|
||||
"@babel/parser": "^7.25.9",
|
||||
"@babel/types": "^7.25.9"
|
||||
"@babel/code-frame": "^7.26.2",
|
||||
"@babel/parser": "^7.26.9",
|
||||
"@babel/types": "^7.26.9"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.9.0"
|
||||
|
@ -2199,9 +2203,10 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@babel/types": {
|
||||
"version": "7.26.3",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.3.tgz",
|
||||
"integrity": "sha512-vN5p+1kl59GVKMvTHt55NzzmYVxprfJD+ql7U9NFIfKCBkYE55LYtS+WtPlaYOyzydrKI8Nezd+aZextrd+FMA==",
|
||||
"version": "7.26.10",
|
||||
"resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.10.tgz",
|
||||
"integrity": "sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/helper-string-parser": "^7.25.9",
|
||||
"@babel/helper-validator-identifier": "^7.25.9"
|
||||
|
|
|
@ -20,12 +20,6 @@ import Image from '@theme/IdealImage';
|
|||
# v1.61.20-stable
|
||||
|
||||
|
||||
:::info
|
||||
|
||||
`v1.61.20-stable` will be live on 2025-02-04.
|
||||
|
||||
:::
|
||||
|
||||
These are the changes since `v1.61.13-stable`.
|
||||
|
||||
This release is primarily focused on:
|
||||
|
|
40
docs/my-website/release_notes/v1.63.0/index.md
Normal file
|
@ -0,0 +1,40 @@
|
|||
---
|
||||
title: v1.63.0 - Anthropic 'thinking' response update
|
||||
slug: v1.63.0
|
||||
date: 2025-03-05T10:00:00
|
||||
authors:
|
||||
- name: Krrish Dholakia
|
||||
title: CEO, LiteLLM
|
||||
url: https://www.linkedin.com/in/krish-d/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
|
||||
- name: Ishaan Jaffer
|
||||
title: CTO, LiteLLM
|
||||
url: https://www.linkedin.com/in/reffajnaahsi/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
|
||||
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
|
||||
hide_table_of_contents: false
|
||||
---
|
||||
|
||||
v1.63.0 fixes Anthropic 'thinking' response on streaming to return the `signature` block. [Github Issue](https://github.com/BerriAI/litellm/issues/8964)
|
||||
|
||||
|
||||
|
||||
It also renames the field from `signature_delta` to `signature`, matching Anthropic's format. [Anthropic Docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#implementing-extended-thinking)
|
||||
|
||||
|
||||
## Diff
|
||||
|
||||
```bash
|
||||
"message": {
|
||||
...
|
||||
"reasoning_content": "The capital of France is Paris.",
|
||||
"thinking_blocks": [
|
||||
{
|
||||
"type": "thinking",
|
||||
"thinking": "The capital of France is Paris.",
|
||||
- "signature_delta": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 OLD FORMAT
|
||||
+ "signature": "EqoBCkgIARABGAIiQL2UoU0b1OHYi+..." # 👈 KEY CHANGE
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
112
docs/my-website/release_notes/v1.63.2-stable/index.md
Normal file
|
@ -0,0 +1,112 @@
|
|||
---
|
||||
title: v1.63.2-stable
|
||||
slug: v1.63.2-stable
|
||||
date: 2025-03-08T10:00:00
|
||||
authors:
|
||||
- name: Krrish Dholakia
|
||||
title: CEO, LiteLLM
|
||||
url: https://www.linkedin.com/in/krish-d/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI
|
||||
- name: Ishaan Jaffer
|
||||
title: CTO, LiteLLM
|
||||
url: https://www.linkedin.com/in/reffajnaahsi/
|
||||
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGiM7ZrUwqu_Q/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1675971026692?e=1741824000&v=beta&t=eQnRdXPJo4eiINWTZARoYTfqh064pgZ-E21pQTSy8jc
|
||||
tags: [llm translation, thinking, reasoning_content, claude-3-7-sonnet]
|
||||
hide_table_of_contents: false
|
||||
---
|
||||
|
||||
import Image from '@theme/IdealImage';
|
||||
|
||||
|
||||
These are the changes since `v1.61.20-stable`.
|
||||
|
||||
This release is primarily focused on:
|
||||
- LLM Translation improvements (more `thinking` content improvements)
|
||||
- UI improvements (Error logs now shown on UI)
|
||||
|
||||
|
||||
:::info
|
||||
|
||||
This release will be live on 03/09/2025
|
||||
|
||||
:::
|
||||
|
||||
<Image img={require('../../img/release_notes/v1632_release.jpg')} />
|
||||
|
||||
|
||||
## Demo Instance
|
||||
|
||||
Here's a Demo Instance to test changes:
|
||||
- Instance: https://demo.litellm.ai/
|
||||
- Login Credentials:
|
||||
- Username: admin
|
||||
- Password: sk-1234
|
||||
|
||||
|
||||
## New Models / Updated Models
|
||||
|
||||
1. Add `supports_pdf_input` for specific Bedrock Claude models [PR](https://github.com/BerriAI/litellm/commit/f63cf0030679fe1a43d03fb196e815a0f28dae92)
|
||||
2. Add pricing for amazon `eu` models [PR](https://github.com/BerriAI/litellm/commits/main/model_prices_and_context_window.json)
|
||||
3. Fix Azure O1 mini pricing [PR](https://github.com/BerriAI/litellm/commit/52de1949ef2f76b8572df751f9c868a016d4832c)
|
||||
|
||||
## LLM Translation
|
||||
|
||||
<Image img={require('../../img/release_notes/anthropic_thinking.jpg')}/>
|
||||
|
||||
1. Support `/openai/` passthrough for Assistant endpoints. [Get Started](https://docs.litellm.ai/docs/pass_through/openai_passthrough)
|
||||
2. Bedrock Claude - fix tool calling transformation on invoke route. [Get Started](../../docs/providers/bedrock#usage---function-calling--tool-calling)
|
||||
3. Bedrock Claude - response_format support for claude on invoke route. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
|
||||
4. Bedrock - pass `description` if set in response_format. [Get Started](../../docs/providers/bedrock#usage---structured-output--json-mode)
|
||||
5. Bedrock - Fix passing response_format: {"type": "text"}. [PR](https://github.com/BerriAI/litellm/commit/c84b489d5897755139aa7d4e9e54727ebe0fa540)
|
||||
6. OpenAI - Handle sending image_url as str to openai. [Get Started](https://docs.litellm.ai/docs/completion/vision)
|
||||
7. Deepseek - fix missing 'reasoning_content' on streaming. [Get Started](https://docs.litellm.ai/docs/reasoning_content)
|
||||
8. Caching - Support caching on reasoning content. [Get Started](https://docs.litellm.ai/docs/proxy/caching)
|
||||
9. Bedrock - handle thinking blocks in assistant message. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
|
||||
10. Anthropic - Return `signature` on streaming. [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---thinking--reasoning-content)
|
||||
- Note: We've also migrated from `signature_delta` to `signature`. [Read more](https://docs.litellm.ai/release_notes/v1.63.0)
|
||||
11. Support format param for specifying image type. [Get Started](../../docs/completion/vision.md#explicitly-specify-image-type)
|
||||
12. Anthropic - `/v1/messages` endpoint - `thinking` param support. [Get Started](../../docs/anthropic_unified.md)
|
||||
- Note: this refactors the [BETA] unified `/v1/messages` endpoint, to just work for the Anthropic API.
|
||||
13. Vertex AI - handle $id in response schema when calling vertex ai. [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema)
|
||||
|
||||
## Spend Tracking Improvements
|
||||
|
||||
1. Batches API - Fix cost calculation to run on retrieve_batch. [Get Started](https://docs.litellm.ai/docs/batches)
|
||||
2. Batches API - Log batch models in spend logs / standard logging payload. [Get Started](../../docs/proxy/logging_spec.md#standardlogginghiddenparams)
|
||||
|
||||
## Management Endpoints / UI
|
||||
|
||||
<Image img={require('../../img/release_notes/error_logs.jpg')} />
|
||||
|
||||
1. Virtual Keys Page
|
||||
- Allow team/org filters to be searchable on the Create Key Page
|
||||
- Add created_by and updated_by fields to Keys table
|
||||
- Show 'user_email' on key table
|
||||
- Show 100 Keys Per Page, Use full height, increase width of key alias
|
||||
2. Logs Page
|
||||
- Show Error Logs on LiteLLM UI
|
||||
- Allow Internal Users to View their own logs
|
||||
3. Internal Users Page
|
||||
- Allow admin to control default model access for internal users
|
||||
4. Fix session handling with cookies
|
||||
|
||||
## Logging / Guardrail Integrations
|
||||
|
||||
1. Fix prometheus metrics w/ custom metrics, when keys containing team_id make requests. [PR](https://github.com/BerriAI/litellm/pull/8935)
|
||||
|
||||
## Performance / Loadbalancing / Reliability improvements
|
||||
|
||||
1. Cooldowns - Support cooldowns on models called with client side credentials. [Get Started](https://docs.litellm.ai/docs/proxy/clientside_auth#pass-user-llm-api-keys--api-base)
|
||||
2. Tag-based Routing - ensures tag-based routing across all endpoints (`/embeddings`, `/image_generation`, etc.). [Get Started](https://docs.litellm.ai/docs/proxy/tag_routing)
|
||||
|
||||
## General Proxy Improvements
|
||||
|
||||
1. Raise BadRequestError when unknown model passed in request
|
||||
2. Enforce model access restrictions on Azure OpenAI proxy route
|
||||
3. Reliability fix - Handle emojis in text - fix orjson error
|
||||
4. Model Access Patch - don't overwrite litellm.anthropic_models when running auth checks
|
||||
5. Enable setting timezone information in docker image
|
||||
|
||||
## Complete Git Diff
|
||||
|
||||
[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.61.20-stable...v1.63.2-stable)
|
|
@ -46,6 +46,7 @@ const sidebars = {
|
|||
"proxy/health",
|
||||
"proxy/debugging",
|
||||
"proxy/spending_monitoring",
|
||||
"proxy/master_key_rotations",
|
||||
],
|
||||
},
|
||||
"proxy/demo",
|
||||
|
@ -257,17 +258,23 @@ const sidebars = {
|
|||
"completion/batching",
|
||||
"completion/mock_requests",
|
||||
"completion/reliable_completions",
|
||||
'tutorials/litellm_proxy_aporia',
|
||||
|
||||
]
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Supported Endpoints",
|
||||
link: {
|
||||
type: "generated-index",
|
||||
title: "Supported Endpoints",
|
||||
description:
|
||||
"Learn how to deploy + call models from different providers on LiteLLM",
|
||||
slug: "/supported_endpoints",
|
||||
},
|
||||
items: [
|
||||
{
|
||||
type: "category",
|
||||
label: "Chat",
|
||||
label: "/chat/completions",
|
||||
link: {
|
||||
type: "generated-index",
|
||||
title: "Chat Completions",
|
||||
|
@ -280,11 +287,13 @@ const sidebars = {
|
|||
"completion/usage",
|
||||
],
|
||||
},
|
||||
"response_api",
|
||||
"text_completion",
|
||||
"embedding/supported_embedding",
|
||||
"anthropic_unified",
|
||||
{
|
||||
type: "category",
|
||||
label: "Image",
|
||||
label: "/images",
|
||||
items: [
|
||||
"image_generation",
|
||||
"image_variations",
|
||||
|
@ -292,7 +301,7 @@ const sidebars = {
|
|||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Audio",
|
||||
label: "/audio",
|
||||
"items": [
|
||||
"audio_transcription",
|
||||
"text_to_speech",
|
||||
|
@ -351,23 +360,6 @@ const sidebars = {
|
|||
label: "LangChain, LlamaIndex, Instructor Integration",
|
||||
items: ["langchain/langchain", "tutorials/instructor"],
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Tutorials",
|
||||
items: [
|
||||
|
||||
'tutorials/azure_openai',
|
||||
'tutorials/instructor',
|
||||
"tutorials/gradio_integration",
|
||||
"tutorials/huggingface_codellama",
|
||||
"tutorials/huggingface_tutorial",
|
||||
"tutorials/TogetherAI_liteLLM",
|
||||
"tutorials/finetuned_chat_gpt",
|
||||
"tutorials/text_completion",
|
||||
"tutorials/first_playground",
|
||||
"tutorials/model_fallbacks",
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
|
@ -384,13 +376,6 @@ const sidebars = {
|
|||
"load_test_rpm",
|
||||
]
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Adding Providers",
|
||||
items: [
|
||||
"adding_provider/directory_structure",
|
||||
"adding_provider/new_rerank_provider"],
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Logging & Observability",
|
||||
|
@ -425,12 +410,51 @@ const sidebars = {
|
|||
"observability/opik_integration",
|
||||
],
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Tutorials",
|
||||
items: [
|
||||
"tutorials/openweb_ui",
|
||||
'tutorials/litellm_proxy_aporia',
|
||||
{
|
||||
type: "category",
|
||||
label: "LiteLLM Python SDK Tutorials",
|
||||
items: [
|
||||
|
||||
'tutorials/azure_openai',
|
||||
'tutorials/instructor',
|
||||
"tutorials/gradio_integration",
|
||||
"tutorials/huggingface_codellama",
|
||||
"tutorials/huggingface_tutorial",
|
||||
"tutorials/TogetherAI_liteLLM",
|
||||
"tutorials/finetuned_chat_gpt",
|
||||
"tutorials/text_completion",
|
||||
"tutorials/first_playground",
|
||||
"tutorials/model_fallbacks",
|
||||
],
|
||||
},
|
||||
]
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Contributing",
|
||||
items: [
|
||||
"extras/contributing_code",
|
||||
{
|
||||
type: "category",
|
||||
label: "Adding Providers",
|
||||
items: [
|
||||
"adding_provider/directory_structure",
|
||||
"adding_provider/new_rerank_provider"],
|
||||
},
|
||||
"extras/contributing",
|
||||
"contributing",
|
||||
]
|
||||
},
|
||||
{
|
||||
type: "category",
|
||||
label: "Extras",
|
||||
items: [
|
||||
"extras/contributing",
|
||||
"data_security",
|
||||
"data_retention",
|
||||
"migration_policy",
|
||||
|
@ -447,6 +471,7 @@ const sidebars = {
|
|||
items: [
|
||||
"projects/smolagents",
|
||||
"projects/Docq.AI",
|
||||
"projects/PDL",
|
||||
"projects/OpenInterpreter",
|
||||
"projects/Elroy",
|
||||
"projects/dbally",
|
||||
|
@ -462,9 +487,9 @@ const sidebars = {
|
|||
"projects/YiVal",
|
||||
"projects/LiteLLM Proxy",
|
||||
"projects/llm_cord",
|
||||
"projects/pgai",
|
||||
],
|
||||
},
|
||||
"contributing",
|
||||
"proxy/pii_masking",
|
||||
"extras/code_quality",
|
||||
"rules",
|
||||
|
|
|
@ -163,7 +163,7 @@ class AporiaGuardrail(CustomGuardrail):
|
|||
|
||||
pass
|
||||
|
||||
async def async_moderation_hook( ### 👈 KEY CHANGE ###
|
||||
async def async_moderation_hook(
|
||||
self,
|
||||
data: dict,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
|
@ -173,6 +173,7 @@ class AporiaGuardrail(CustomGuardrail):
|
|||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
"responses",
|
||||
],
|
||||
):
|
||||
from litellm.proxy.common_utils.callback_utils import (
|
||||
|
|
|
@ -94,6 +94,7 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
|
|||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
"responses",
|
||||
],
|
||||
):
|
||||
"""
|
||||
|
|
|
@ -107,6 +107,7 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
|
|||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
"responses",
|
||||
],
|
||||
):
|
||||
"""
|
||||
|
|
|
@ -126,6 +126,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
|
|||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
"responses",
|
||||
],
|
||||
):
|
||||
"""
|
||||
|
|
|
@ -31,7 +31,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
|
|||
|
||||
#### CALL HOOKS - proxy only ####
|
||||
|
||||
async def async_moderation_hook( ### 👈 KEY CHANGE ###
|
||||
async def async_moderation_hook(
|
||||
self,
|
||||
data: dict,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
|
@ -41,6 +41,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
|
|||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
"responses",
|
||||
],
|
||||
):
|
||||
text = ""
|
||||
|
|
|
@ -8,12 +8,14 @@ import os
|
|||
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
|
||||
from litellm.caching.llm_caching_handler import LLMClientCache
|
||||
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
|
||||
from litellm.types.utils import (
|
||||
ImageObject,
|
||||
BudgetConfig,
|
||||
all_litellm_params,
|
||||
all_litellm_params as _litellm_completion_params,
|
||||
CredentialItem,
|
||||
) # maintain backwards compatibility for root param
|
||||
from litellm._logging import (
|
||||
set_verbose,
|
||||
|
@ -190,15 +192,17 @@ ssl_verify: Union[str, bool] = True
|
|||
ssl_certificate: Optional[str] = None
|
||||
disable_streaming_logging: bool = False
|
||||
disable_add_transform_inline_image_block: bool = False
|
||||
in_memory_llm_clients_cache: InMemoryCache = InMemoryCache()
|
||||
in_memory_llm_clients_cache: LLMClientCache = LLMClientCache()
|
||||
safe_memory_mode: bool = False
|
||||
enable_azure_ad_token_refresh: Optional[bool] = False
|
||||
### DEFAULT AZURE API VERSION ###
|
||||
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest
|
||||
AZURE_DEFAULT_API_VERSION = "2025-02-01-preview" # this is updated to the latest
|
||||
### DEFAULT WATSONX API VERSION ###
|
||||
WATSONX_DEFAULT_API_VERSION = "2024-03-13"
|
||||
### COHERE EMBEDDINGS DEFAULT TYPE ###
|
||||
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
|
||||
### CREDENTIALS ###
|
||||
credential_list: List[CredentialItem] = []
|
||||
### GUARDRAILS ###
|
||||
llamaguard_model_name: Optional[str] = None
|
||||
openai_moderations_model_name: Optional[str] = None
|
||||
|
@ -278,8 +282,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
|
|||
custom_prometheus_metadata_labels: List[str] = []
|
||||
#### REQUEST PRIORITIZATION ####
|
||||
priority_reservation: Optional[Dict[str, float]] = None
|
||||
|
||||
|
||||
force_ipv4: bool = (
|
||||
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
|
||||
)
|
||||
|
@ -807,9 +809,6 @@ from .llms.oobabooga.chat.transformation import OobaboogaConfig
|
|||
from .llms.maritalk import MaritalkConfig
|
||||
from .llms.openrouter.chat.transformation import OpenrouterConfig
|
||||
from .llms.anthropic.chat.transformation import AnthropicConfig
|
||||
from .llms.anthropic.experimental_pass_through.transformation import (
|
||||
AnthropicExperimentalPassThroughConfig,
|
||||
)
|
||||
from .llms.groq.stt.transformation import GroqSTTConfig
|
||||
from .llms.anthropic.completion.transformation import AnthropicTextConfig
|
||||
from .llms.triton.completion.transformation import TritonConfig
|
||||
|
@ -829,6 +828,9 @@ from .llms.infinity.rerank.transformation import InfinityRerankConfig
|
|||
from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
|
||||
from .llms.clarifai.chat.transformation import ClarifaiConfig
|
||||
from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
|
||||
from .llms.anthropic.experimental_pass_through.messages.transformation import (
|
||||
AnthropicMessagesConfig,
|
||||
)
|
||||
from .llms.together_ai.chat import TogetherAIConfig
|
||||
from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
|
||||
from .llms.cloudflare.chat.transformation import CloudflareChatConfig
|
||||
|
@ -909,6 +911,7 @@ from .llms.bedrock.chat.invoke_transformations.base_invoke_transformation import
|
|||
|
||||
from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
|
||||
from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
|
||||
from .llms.bedrock.image.amazon_nova_canvas_transformation import AmazonNovaCanvasConfig
|
||||
from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
|
||||
from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
|
||||
AmazonTitanMultimodalEmbeddingG1Config,
|
||||
|
@ -931,6 +934,7 @@ from .llms.groq.chat.transformation import GroqChatConfig
|
|||
from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
|
||||
from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
|
||||
from .llms.mistral.mistral_chat_transformation import MistralConfig
|
||||
from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
|
||||
from .llms.openai.chat.o_series_transformation import (
|
||||
OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility
|
||||
OpenAIOSeriesConfig,
|
||||
|
@ -1021,6 +1025,8 @@ from .assistants.main import *
|
|||
from .batches.main import *
|
||||
from .batch_completion.main import * # type: ignore
|
||||
from .rerank_api.main import *
|
||||
from .llms.anthropic.experimental_pass_through.messages.handler import *
|
||||
from .responses.main import *
|
||||
from .realtime_api.main import _arealtime
|
||||
from .fine_tuning.main import *
|
||||
from .files.main import *
|
||||
|
|
|
@ -1,186 +0,0 @@
|
|||
# What is this?
|
||||
## Translates OpenAI call to Anthropic `/v1/messages` format
|
||||
import traceback
|
||||
from typing import Any, Optional
|
||||
|
||||
import litellm
|
||||
from litellm import ChatCompletionRequest, verbose_logger
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
|
||||
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
|
||||
|
||||
|
||||
class AnthropicAdapter(CustomLogger):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def translate_completion_input_params(
|
||||
self, kwargs
|
||||
) -> Optional[ChatCompletionRequest]:
|
||||
"""
|
||||
- translate params, where needed
|
||||
- pass rest, as is
|
||||
"""
|
||||
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
|
||||
|
||||
translated_body = litellm.AnthropicExperimentalPassThroughConfig().translate_anthropic_to_openai(
|
||||
anthropic_message_request=request_body
|
||||
)
|
||||
|
||||
return translated_body
|
||||
|
||||
def translate_completion_output_params(
|
||||
self, response: ModelResponse
|
||||
) -> Optional[AnthropicResponse]:
|
||||
|
||||
return litellm.AnthropicExperimentalPassThroughConfig().translate_openai_response_to_anthropic(
|
||||
response=response
|
||||
)
|
||||
|
||||
def translate_completion_output_params_streaming(
|
||||
self, completion_stream: Any
|
||||
) -> AdapterCompletionStreamWrapper | None:
|
||||
return AnthropicStreamWrapper(completion_stream=completion_stream)
|
||||
|
||||
|
||||
anthropic_adapter = AnthropicAdapter()
|
||||
|
||||
|
||||
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
|
||||
"""
|
||||
- first chunk return 'message_start'
|
||||
- content block must be started and stopped
|
||||
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
|
||||
"""
|
||||
|
||||
sent_first_chunk: bool = False
|
||||
sent_content_block_start: bool = False
|
||||
sent_content_block_finish: bool = False
|
||||
sent_last_message: bool = False
|
||||
holding_chunk: Optional[Any] = None
|
||||
|
||||
def __next__(self):
|
||||
try:
|
||||
if self.sent_first_chunk is False:
|
||||
self.sent_first_chunk = True
|
||||
return {
|
||||
"type": "message_start",
|
||||
"message": {
|
||||
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [],
|
||||
"model": "claude-3-5-sonnet-20240620",
|
||||
"stop_reason": None,
|
||||
"stop_sequence": None,
|
||||
"usage": {"input_tokens": 25, "output_tokens": 1},
|
||||
},
|
||||
}
|
||||
if self.sent_content_block_start is False:
|
||||
self.sent_content_block_start = True
|
||||
return {
|
||||
"type": "content_block_start",
|
||||
"index": 0,
|
||||
"content_block": {"type": "text", "text": ""},
|
||||
}
|
||||
|
||||
for chunk in self.completion_stream:
|
||||
if chunk == "None" or chunk is None:
|
||||
raise Exception
|
||||
|
||||
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
|
||||
response=chunk
|
||||
)
|
||||
if (
|
||||
processed_chunk["type"] == "message_delta"
|
||||
and self.sent_content_block_finish is False
|
||||
):
|
||||
self.holding_chunk = processed_chunk
|
||||
self.sent_content_block_finish = True
|
||||
return {
|
||||
"type": "content_block_stop",
|
||||
"index": 0,
|
||||
}
|
||||
elif self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = processed_chunk
|
||||
return return_chunk
|
||||
else:
|
||||
return processed_chunk
|
||||
if self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = None
|
||||
return return_chunk
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopIteration
|
||||
except StopIteration:
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopIteration
|
||||
except Exception as e:
|
||||
verbose_logger.error(
|
||||
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
|
||||
)
|
||||
|
||||
async def __anext__(self):
|
||||
try:
|
||||
if self.sent_first_chunk is False:
|
||||
self.sent_first_chunk = True
|
||||
return {
|
||||
"type": "message_start",
|
||||
"message": {
|
||||
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [],
|
||||
"model": "claude-3-5-sonnet-20240620",
|
||||
"stop_reason": None,
|
||||
"stop_sequence": None,
|
||||
"usage": {"input_tokens": 25, "output_tokens": 1},
|
||||
},
|
||||
}
|
||||
if self.sent_content_block_start is False:
|
||||
self.sent_content_block_start = True
|
||||
return {
|
||||
"type": "content_block_start",
|
||||
"index": 0,
|
||||
"content_block": {"type": "text", "text": ""},
|
||||
}
|
||||
async for chunk in self.completion_stream:
|
||||
if chunk == "None" or chunk is None:
|
||||
raise Exception
|
||||
processed_chunk = litellm.AnthropicExperimentalPassThroughConfig().translate_streaming_openai_response_to_anthropic(
|
||||
response=chunk
|
||||
)
|
||||
if (
|
||||
processed_chunk["type"] == "message_delta"
|
||||
and self.sent_content_block_finish is False
|
||||
):
|
||||
self.holding_chunk = processed_chunk
|
||||
self.sent_content_block_finish = True
|
||||
return {
|
||||
"type": "content_block_stop",
|
||||
"index": 0,
|
||||
}
|
||||
elif self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = processed_chunk
|
||||
return return_chunk
|
||||
else:
|
||||
return processed_chunk
|
||||
if self.holding_chunk is not None:
|
||||
return_chunk = self.holding_chunk
|
||||
self.holding_chunk = None
|
||||
return return_chunk
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopIteration
|
||||
except StopIteration:
|
||||
if self.sent_last_message is False:
|
||||
self.sent_last_message = True
|
||||
return {"type": "message_stop"}
|
||||
raise StopAsyncIteration
|
|
@ -15,6 +15,7 @@ import litellm
|
|||
from litellm.types.router import GenericLiteLLMParams
|
||||
from litellm.utils import (
|
||||
exception_type,
|
||||
get_litellm_params,
|
||||
get_llm_provider,
|
||||
get_secret,
|
||||
supports_httpx_timeout,
|
||||
|
@ -86,6 +87,7 @@ def get_assistants(
|
|||
optional_params = GenericLiteLLMParams(
|
||||
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
|
||||
)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
|
@ -169,6 +171,7 @@ def get_assistants(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
aget_assistants=aget_assistants, # type: ignore
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -270,6 +273,7 @@ def create_assistants(
|
|||
optional_params = GenericLiteLLMParams(
|
||||
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
|
||||
)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
|
@ -371,6 +375,7 @@ def create_assistants(
|
|||
client=client,
|
||||
async_create_assistants=async_create_assistants,
|
||||
create_assistant_data=create_assistant_data,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -445,6 +450,8 @@ def delete_assistant(
|
|||
api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
|
||||
)
|
||||
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
async_delete_assistants: Optional[bool] = kwargs.pop(
|
||||
"async_delete_assistants", None
|
||||
)
|
||||
|
@ -544,6 +551,7 @@ def delete_assistant(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
async_delete_assistants=async_delete_assistants,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -639,6 +647,7 @@ def create_thread(
|
|||
"""
|
||||
acreate_thread = kwargs.get("acreate_thread", None)
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
|
@ -731,6 +740,7 @@ def create_thread(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
acreate_thread=acreate_thread,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -795,7 +805,7 @@ def get_thread(
|
|||
"""Get the thread object, given a thread_id"""
|
||||
aget_thread = kwargs.pop("aget_thread", None)
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
# set timeout for 10 minutes by default
|
||||
|
@ -884,6 +894,7 @@ def get_thread(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
aget_thread=aget_thread,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -972,6 +983,7 @@ def add_message(
|
|||
_message_data = MessageData(
|
||||
role=role, content=content, attachments=attachments, metadata=metadata
|
||||
)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
|
||||
message_data = get_optional_params_add_message(
|
||||
|
@ -1068,6 +1080,7 @@ def add_message(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
a_add_message=a_add_message,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -1139,6 +1152,7 @@ def get_messages(
|
|||
) -> SyncCursorPage[OpenAIMessage]:
|
||||
aget_messages = kwargs.pop("aget_messages", None)
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
|
@ -1225,6 +1239,7 @@ def get_messages(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
aget_messages=aget_messages,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -1337,6 +1352,7 @@ def run_thread(
|
|||
"""Run a given thread + assistant."""
|
||||
arun_thread = kwargs.pop("arun_thread", None)
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
|
@ -1437,6 +1453,7 @@ def run_thread(
|
|||
max_retries=optional_params.max_retries,
|
||||
client=client,
|
||||
arun_thread=arun_thread,
|
||||
litellm_params=litellm_params_dict,
|
||||
) # type: ignore
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
|
|
@ -1,76 +1,16 @@
|
|||
import asyncio
|
||||
import datetime
|
||||
import json
|
||||
import threading
|
||||
from typing import Any, List, Literal, Optional
|
||||
from typing import Any, List, Literal, Tuple
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.constants import (
|
||||
BATCH_STATUS_POLL_INTERVAL_SECONDS,
|
||||
BATCH_STATUS_POLL_MAX_ATTEMPTS,
|
||||
)
|
||||
from litellm.files.main import afile_content
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
|
||||
from litellm.types.llms.openai import Batch
|
||||
from litellm.types.utils import StandardLoggingPayload, Usage
|
||||
|
||||
|
||||
async def batches_async_logging(
|
||||
batch_id: str,
|
||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
||||
logging_obj: Optional[LiteLLMLoggingObj] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Async Job waits for the batch to complete and then logs the completed batch usage - cost, total tokens, prompt tokens, completion tokens
|
||||
|
||||
|
||||
Polls retrieve_batch until it returns a batch with status "completed" or "failed"
|
||||
"""
|
||||
from .main import aretrieve_batch
|
||||
|
||||
verbose_logger.debug(
|
||||
".....in _batches_async_logging... polling retrieve to get batch status"
|
||||
)
|
||||
if logging_obj is None:
|
||||
raise ValueError(
|
||||
"logging_obj is None cannot calculate cost / log batch creation event"
|
||||
)
|
||||
for _ in range(BATCH_STATUS_POLL_MAX_ATTEMPTS):
|
||||
try:
|
||||
start_time = datetime.datetime.now()
|
||||
batch: Batch = await aretrieve_batch(batch_id, custom_llm_provider)
|
||||
verbose_logger.debug(
|
||||
"in _batches_async_logging... batch status= %s", batch.status
|
||||
)
|
||||
|
||||
if batch.status == "completed":
|
||||
end_time = datetime.datetime.now()
|
||||
await _handle_completed_batch(
|
||||
batch=batch,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
logging_obj=logging_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
**kwargs,
|
||||
)
|
||||
break
|
||||
elif batch.status == "failed":
|
||||
pass
|
||||
except Exception as e:
|
||||
verbose_logger.error("error in batches_async_logging", e)
|
||||
await asyncio.sleep(BATCH_STATUS_POLL_INTERVAL_SECONDS)
|
||||
from litellm.types.utils import CallTypes, Usage
|
||||
|
||||
|
||||
async def _handle_completed_batch(
|
||||
batch: Batch,
|
||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
start_time: datetime.datetime,
|
||||
end_time: datetime.datetime,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
) -> Tuple[float, Usage, List[str]]:
|
||||
"""Helper function to process a completed batch and handle logging"""
|
||||
# Get batch results
|
||||
file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
|
||||
|
@ -87,52 +27,25 @@ async def _handle_completed_batch(
|
|||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
# Handle logging
|
||||
await _log_completed_batch(
|
||||
logging_obj=logging_obj,
|
||||
batch_usage=batch_usage,
|
||||
batch_cost=batch_cost,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
**kwargs,
|
||||
)
|
||||
batch_models = _get_batch_models_from_file_content(file_content_dictionary)
|
||||
|
||||
return batch_cost, batch_usage, batch_models
|
||||
|
||||
|
||||
async def _log_completed_batch(
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
batch_usage: Usage,
|
||||
batch_cost: float,
|
||||
start_time: datetime.datetime,
|
||||
end_time: datetime.datetime,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Helper function to handle all logging operations for a completed batch"""
|
||||
logging_obj.call_type = "batch_success"
|
||||
|
||||
standard_logging_object = _create_standard_logging_object_for_completed_batch(
|
||||
kwargs=kwargs,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
logging_obj=logging_obj,
|
||||
batch_usage_object=batch_usage,
|
||||
response_cost=batch_cost,
|
||||
)
|
||||
|
||||
logging_obj.model_call_details["standard_logging_object"] = standard_logging_object
|
||||
|
||||
# Launch async and sync logging handlers
|
||||
asyncio.create_task(
|
||||
logging_obj.async_success_handler(
|
||||
result=None,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
cache_hit=None,
|
||||
)
|
||||
)
|
||||
threading.Thread(
|
||||
target=logging_obj.success_handler,
|
||||
args=(None, start_time, end_time),
|
||||
).start()
|
||||
def _get_batch_models_from_file_content(
|
||||
file_content_dictionary: List[dict],
|
||||
) -> List[str]:
|
||||
"""
|
||||
Get the models from the file content
|
||||
"""
|
||||
batch_models = []
|
||||
for _item in file_content_dictionary:
|
||||
if _batch_response_was_successful(_item):
|
||||
_response_body = _get_response_from_batch_job_output_file(_item)
|
||||
_model = _response_body.get("model")
|
||||
if _model:
|
||||
batch_models.append(_model)
|
||||
return batch_models
|
||||
|
||||
|
||||
async def _batch_cost_calculator(
|
||||
|
@ -159,6 +72,8 @@ async def _get_batch_output_file_content_as_dictionary(
|
|||
"""
|
||||
Get the batch output file content as a list of dictionaries
|
||||
"""
|
||||
from litellm.files.main import afile_content
|
||||
|
||||
if custom_llm_provider == "vertex_ai":
|
||||
raise ValueError("Vertex AI does not support file content retrieval")
|
||||
|
||||
|
@ -208,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
|
|||
total_cost += litellm.completion_cost(
|
||||
completion_response=_response_body,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
call_type=CallTypes.aretrieve_batch.value,
|
||||
)
|
||||
verbose_logger.debug("total_cost=%s", total_cost)
|
||||
return total_cost
|
||||
|
@ -264,30 +180,3 @@ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
|
|||
"""
|
||||
_response: dict = batch_job_output_file.get("response", None) or {}
|
||||
return _response.get("status_code", None) == 200
|
||||
|
||||
|
||||
def _create_standard_logging_object_for_completed_batch(
|
||||
kwargs: dict,
|
||||
start_time: datetime.datetime,
|
||||
end_time: datetime.datetime,
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
batch_usage_object: Usage,
|
||||
response_cost: float,
|
||||
) -> StandardLoggingPayload:
|
||||
"""
|
||||
Create a standard logging object for a completed batch
|
||||
"""
|
||||
standard_logging_object = logging_obj.model_call_details.get(
|
||||
"standard_logging_object", None
|
||||
)
|
||||
|
||||
if standard_logging_object is None:
|
||||
raise ValueError("unable to create standard logging object for completed batch")
|
||||
|
||||
# Add Completed Batch Job Usage and Response Cost
|
||||
standard_logging_object["call_type"] = "batch_success"
|
||||
standard_logging_object["response_cost"] = response_cost
|
||||
standard_logging_object["total_tokens"] = batch_usage_object.total_tokens
|
||||
standard_logging_object["prompt_tokens"] = batch_usage_object.prompt_tokens
|
||||
standard_logging_object["completion_tokens"] = batch_usage_object.completion_tokens
|
||||
return standard_logging_object
|
||||
|
|
|
@ -31,10 +31,9 @@ from litellm.types.llms.openai import (
|
|||
RetrieveBatchRequest,
|
||||
)
|
||||
from litellm.types.router import GenericLiteLLMParams
|
||||
from litellm.types.utils import LiteLLMBatch
|
||||
from litellm.utils import client, get_litellm_params, supports_httpx_timeout
|
||||
|
||||
from .batch_utils import batches_async_logging
|
||||
|
||||
####### ENVIRONMENT VARIABLES ###################
|
||||
openai_batches_instance = OpenAIBatchesAPI()
|
||||
azure_batches_instance = AzureBatchesAPI()
|
||||
|
@ -85,17 +84,6 @@ async def acreate_batch(
|
|||
else:
|
||||
response = init_response
|
||||
|
||||
# Start async logging job
|
||||
if response is not None:
|
||||
asyncio.create_task(
|
||||
batches_async_logging(
|
||||
logging_obj=kwargs.get("litellm_logging_obj", None),
|
||||
batch_id=response.id,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
@ -111,7 +99,7 @@ def create_batch(
|
|||
extra_headers: Optional[Dict[str, str]] = None,
|
||||
extra_body: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
|
||||
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
|
||||
"""
|
||||
Creates and executes a batch from an uploaded file of request
|
||||
|
||||
|
@ -119,21 +107,27 @@ def create_batch(
|
|||
"""
|
||||
try:
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_call_id = kwargs.get("litellm_call_id", None)
|
||||
proxy_server_request = kwargs.get("proxy_server_request", None)
|
||||
model_info = kwargs.get("model_info", None)
|
||||
_is_async = kwargs.pop("acreate_batch", False) is True
|
||||
litellm_params = get_litellm_params(**kwargs)
|
||||
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
litellm_params = get_litellm_params(
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
litellm_call_id=kwargs.get("litellm_call_id", None),
|
||||
litellm_trace_id=kwargs.get("litellm_trace_id"),
|
||||
litellm_metadata=kwargs.get("litellm_metadata"),
|
||||
)
|
||||
litellm_logging_obj.update_environment_variables(
|
||||
model=None,
|
||||
user=None,
|
||||
optional_params=optional_params.model_dump(),
|
||||
litellm_params=litellm_params,
|
||||
litellm_params={
|
||||
"litellm_call_id": litellm_call_id,
|
||||
"proxy_server_request": proxy_server_request,
|
||||
"model_info": model_info,
|
||||
"metadata": metadata,
|
||||
"preset_cache_key": None,
|
||||
"stream_response": {},
|
||||
**optional_params.model_dump(exclude_unset=True),
|
||||
},
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
|
@ -224,6 +218,7 @@ def create_batch(
|
|||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
create_batch_data=_create_batch_request,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
api_base = optional_params.api_base or ""
|
||||
|
@ -261,7 +256,7 @@ def create_batch(
|
|||
response=httpx.Response(
|
||||
status_code=400,
|
||||
content="Unsupported provider",
|
||||
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||
request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
|
||||
),
|
||||
)
|
||||
return response
|
||||
|
@ -269,6 +264,7 @@ def create_batch(
|
|||
raise e
|
||||
|
||||
|
||||
@client
|
||||
async def aretrieve_batch(
|
||||
batch_id: str,
|
||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
||||
|
@ -276,7 +272,7 @@ async def aretrieve_batch(
|
|||
extra_headers: Optional[Dict[str, str]] = None,
|
||||
extra_body: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
) -> Batch:
|
||||
) -> LiteLLMBatch:
|
||||
"""
|
||||
Async: Retrieves a batch.
|
||||
|
||||
|
@ -310,6 +306,7 @@ async def aretrieve_batch(
|
|||
raise e
|
||||
|
||||
|
||||
@client
|
||||
def retrieve_batch(
|
||||
batch_id: str,
|
||||
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
|
||||
|
@ -317,7 +314,7 @@ def retrieve_batch(
|
|||
extra_headers: Optional[Dict[str, str]] = None,
|
||||
extra_body: Optional[Dict[str, str]] = None,
|
||||
**kwargs,
|
||||
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
|
||||
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
|
||||
"""
|
||||
Retrieves a batch.
|
||||
|
||||
|
@ -325,9 +322,20 @@ def retrieve_batch(
|
|||
"""
|
||||
try:
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
# set timeout for 10 minutes by default
|
||||
litellm_params = get_litellm_params(
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**kwargs,
|
||||
)
|
||||
litellm_logging_obj.update_environment_variables(
|
||||
model=None,
|
||||
user=None,
|
||||
optional_params=optional_params.model_dump(),
|
||||
litellm_params=litellm_params,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
if (
|
||||
timeout is not None
|
||||
|
@ -415,6 +423,7 @@ def retrieve_batch(
|
|||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
retrieve_batch_data=_retrieve_batch_request,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
api_base = optional_params.api_base or ""
|
||||
|
@ -517,6 +526,10 @@ def list_batches(
|
|||
try:
|
||||
# set API KEY
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_params = get_litellm_params(
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**kwargs,
|
||||
)
|
||||
api_key = (
|
||||
optional_params.api_key
|
||||
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
|
||||
|
@ -594,6 +607,7 @@ def list_batches(
|
|||
api_version=api_version,
|
||||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
@ -669,6 +683,10 @@ def cancel_batch(
|
|||
"""
|
||||
try:
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_params = get_litellm_params(
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**kwargs,
|
||||
)
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
# set timeout for 10 minutes by default
|
||||
|
@ -756,6 +774,7 @@ def cancel_batch(
|
|||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
cancel_batch_data=_cancel_batch_request,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
|
|
|
@ -247,7 +247,6 @@ class LLMCachingHandler:
|
|||
pass
|
||||
else:
|
||||
call_type = original_function.__name__
|
||||
|
||||
cached_result = self._convert_cached_result_to_model_response(
|
||||
cached_result=cached_result,
|
||||
call_type=call_type,
|
||||
|
@ -725,6 +724,7 @@ class LLMCachingHandler:
|
|||
"""
|
||||
Sync internal method to add the result to the cache
|
||||
"""
|
||||
|
||||
new_kwargs = kwargs.copy()
|
||||
new_kwargs.update(
|
||||
convert_args_to_kwargs(
|
||||
|
@ -738,6 +738,7 @@ class LLMCachingHandler:
|
|||
if self._should_store_result_in_cache(
|
||||
original_function=self.original_function, kwargs=new_kwargs
|
||||
):
|
||||
|
||||
litellm.cache.add_cache(result, **new_kwargs)
|
||||
|
||||
return
|
||||
|
|
40
litellm/caching/llm_caching_handler.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
"""
|
||||
Add the event loop to the cache key, to prevent event loop closed errors.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from .in_memory_cache import InMemoryCache
|
||||
|
||||
|
||||
class LLMClientCache(InMemoryCache):
|
||||
|
||||
def update_cache_key_with_event_loop(self, key):
|
||||
"""
|
||||
Add the event loop to the cache key, to prevent event loop closed errors.
|
||||
If none, use the key as is.
|
||||
"""
|
||||
try:
|
||||
event_loop = asyncio.get_event_loop()
|
||||
stringified_event_loop = str(id(event_loop))
|
||||
return f"{key}-{stringified_event_loop}"
|
||||
except Exception: # handle no current event loop
|
||||
return key
|
||||
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
key = self.update_cache_key_with_event_loop(key)
|
||||
return super().set_cache(key, value, **kwargs)
|
||||
|
||||
async def async_set_cache(self, key, value, **kwargs):
|
||||
key = self.update_cache_key_with_event_loop(key)
|
||||
return await super().async_set_cache(key, value, **kwargs)
|
||||
|
||||
def get_cache(self, key, **kwargs):
|
||||
key = self.update_cache_key_with_event_loop(key)
|
||||
|
||||
return super().get_cache(key, **kwargs)
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
key = self.update_cache_key_with_event_loop(key)
|
||||
|
||||
return await super().async_get_cache(key, **kwargs)
|
|
@ -18,6 +18,7 @@ SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests
|
|||
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
|
||||
#### Networking settings ####
|
||||
request_timeout: float = 6000 # time in seconds
|
||||
STREAM_SSE_DONE_STRING: str = "[DONE]"
|
||||
|
||||
LITELLM_CHAT_PROVIDERS = [
|
||||
"openai",
|
||||
|
|
|
@ -44,7 +44,12 @@ from litellm.llms.vertex_ai.cost_calculator import cost_router as google_cost_ro
|
|||
from litellm.llms.vertex_ai.image_generation.cost_calculator import (
|
||||
cost_calculator as vertex_ai_image_cost_calculator,
|
||||
)
|
||||
from litellm.types.llms.openai import HttpxBinaryResponseContent
|
||||
from litellm.responses.utils import ResponseAPILoggingUtils
|
||||
from litellm.types.llms.openai import (
|
||||
HttpxBinaryResponseContent,
|
||||
ResponseAPIUsage,
|
||||
ResponsesAPIResponse,
|
||||
)
|
||||
from litellm.types.rerank import RerankBilledUnits, RerankResponse
|
||||
from litellm.types.utils import (
|
||||
CallTypesLiteral,
|
||||
|
@ -239,6 +244,15 @@ def cost_per_token( # noqa: PLR0915
|
|||
custom_llm_provider=custom_llm_provider,
|
||||
billed_units=rerank_billed_units,
|
||||
)
|
||||
elif (
|
||||
call_type == "aretrieve_batch"
|
||||
or call_type == "retrieve_batch"
|
||||
or call_type == CallTypes.aretrieve_batch
|
||||
or call_type == CallTypes.retrieve_batch
|
||||
):
|
||||
return batch_cost_calculator(
|
||||
usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
elif call_type == "atranscription" or call_type == "transcription":
|
||||
return openai_cost_per_second(
|
||||
model=model,
|
||||
|
@ -399,9 +413,12 @@ def _select_model_name_for_cost_calc(
|
|||
if base_model is not None:
|
||||
return_model = base_model
|
||||
|
||||
completion_response_model: Optional[str] = getattr(
|
||||
completion_response, "model", None
|
||||
)
|
||||
completion_response_model: Optional[str] = None
|
||||
if completion_response is not None:
|
||||
if isinstance(completion_response, BaseModel):
|
||||
completion_response_model = getattr(completion_response, "model", None)
|
||||
elif isinstance(completion_response, dict):
|
||||
completion_response_model = completion_response.get("model", None)
|
||||
hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
|
||||
if completion_response_model is None and hidden_params is not None:
|
||||
if (
|
||||
|
@ -452,6 +469,13 @@ def _get_usage_object(
    return usage_obj


def _is_known_usage_objects(usage_obj):
    """Returns True if the usage obj is a known Usage type"""
    return isinstance(usage_obj, litellm.Usage) or isinstance(
        usage_obj, ResponseAPIUsage
    )


def _infer_call_type(
    call_type: Optional[CallTypesLiteral], completion_response: Any
) -> Optional[CallTypesLiteral]:
@ -561,9 +585,7 @@ def completion_cost( # noqa: PLR0915
|
|||
base_model=base_model,
|
||||
)
|
||||
|
||||
verbose_logger.debug(
|
||||
f"completion_response _select_model_name_for_cost_calc: {model}"
|
||||
)
|
||||
verbose_logger.info(f"selected model name for cost calculation: {model}")
|
||||
|
||||
if completion_response is not None and (
|
||||
isinstance(completion_response, BaseModel)
|
||||
|
@ -575,8 +597,8 @@ def completion_cost( # noqa: PLR0915
|
|||
)
|
||||
else:
|
||||
usage_obj = getattr(completion_response, "usage", {})
|
||||
if isinstance(usage_obj, BaseModel) and not isinstance(
|
||||
usage_obj, litellm.Usage
|
||||
if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
|
||||
usage_obj=usage_obj
|
||||
):
|
||||
setattr(
|
||||
completion_response,
|
||||
|
@ -589,6 +611,14 @@ def completion_cost( # noqa: PLR0915
|
|||
_usage = usage_obj.model_dump()
|
||||
else:
|
||||
_usage = usage_obj
|
||||
|
||||
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
|
||||
_usage = (
|
||||
ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
|
||||
_usage
|
||||
).model_dump()
|
||||
)
|
||||
|
||||
# get input/output tokens from completion_response
|
||||
prompt_tokens = _usage.get("prompt_tokens", 0)
|
||||
completion_tokens = _usage.get("completion_tokens", 0)
|
||||
|
@ -787,6 +817,7 @@ def response_cost_calculator(
|
|||
TextCompletionResponse,
|
||||
HttpxBinaryResponseContent,
|
||||
RerankResponse,
|
||||
ResponsesAPIResponse,
|
||||
],
|
||||
model: str,
|
||||
custom_llm_provider: Optional[str],
|
||||
|
@ -957,3 +988,54 @@ def default_image_cost_calculator(
    )

    return cost_info["input_cost_per_pixel"] * height * width * n


def batch_cost_calculator(
    usage: Usage,
    model: str,
    custom_llm_provider: Optional[str] = None,
) -> Tuple[float, float]:
    """
    Calculate the cost of a batch job
    """

    _, custom_llm_provider, _, _ = litellm.get_llm_provider(
        model=model, custom_llm_provider=custom_llm_provider
    )

    verbose_logger.info(
        "Calculating batch cost per token. model=%s, custom_llm_provider=%s",
        model,
        custom_llm_provider,
    )

    try:
        model_info: Optional[ModelInfo] = litellm.get_model_info(
            model=model, custom_llm_provider=custom_llm_provider
        )
    except Exception:
        model_info = None

    if not model_info:
        return 0.0, 0.0

    input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
    input_cost_per_token = model_info.get("input_cost_per_token")
    output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
    output_cost_per_token = model_info.get("output_cost_per_token")
    total_prompt_cost = 0.0
    total_completion_cost = 0.0
    if input_cost_per_token_batches:
        total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
    elif input_cost_per_token:
        total_prompt_cost = (
            usage.prompt_tokens * (input_cost_per_token) / 2
        )  # batch cost is usually half of the regular token cost
    if output_cost_per_token_batches:
        total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
    elif output_cost_per_token:
        total_completion_cost = (
            usage.completion_tokens * (output_cost_per_token) / 2
        )  # batch cost is usually half of the regular token cost

    return total_prompt_cost, total_completion_cost
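A worked example of the fallback above (prices are hypothetical, not from the diff): when the model map has no `*_cost_per_token_batches` entries, batch usage is billed at half the regular per-token rate.

# hypothetical per-token prices for a model with no batch-specific pricing
input_cost_per_token = 0.000002    # $2 per 1M prompt tokens
output_cost_per_token = 0.000006   # $6 per 1M completion tokens

prompt_tokens, completion_tokens = 100_000, 20_000

total_prompt_cost = prompt_tokens * input_cost_per_token / 2            # 0.10
total_completion_cost = completion_tokens * output_cost_per_token / 2   # 0.06
print(total_prompt_cost + total_completion_cost)                        # 0.16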
@ -118,6 +118,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
|
|||
litellm_debug_info: Optional[str] = None,
|
||||
max_retries: Optional[int] = None,
|
||||
num_retries: Optional[int] = None,
|
||||
body: Optional[dict] = None,
|
||||
):
|
||||
self.status_code = 400
|
||||
self.message = "litellm.BadRequestError: {}".format(message)
|
||||
|
@ -133,7 +134,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
|
|||
self.max_retries = max_retries
|
||||
self.num_retries = num_retries
|
||||
super().__init__(
|
||||
self.message, response=response, body=None
|
||||
self.message, response=response, body=body
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
def __str__(self):
|
||||
|
|
|
@ -25,7 +25,7 @@ from litellm.types.llms.openai import (
|
|||
HttpxBinaryResponseContent,
|
||||
)
|
||||
from litellm.types.router import *
|
||||
from litellm.utils import supports_httpx_timeout
|
||||
from litellm.utils import get_litellm_params, supports_httpx_timeout
|
||||
|
||||
####### ENVIRONMENT VARIABLES ###################
|
||||
openai_files_instance = OpenAIFilesAPI()
|
||||
|
@ -546,6 +546,7 @@ def create_file(
|
|||
try:
|
||||
_is_async = kwargs.pop("acreate_file", False) is True
|
||||
optional_params = GenericLiteLLMParams(**kwargs)
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
### TIMEOUT LOGIC ###
|
||||
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
|
||||
|
@ -630,6 +631,7 @@ def create_file(
|
|||
timeout=timeout,
|
||||
max_retries=optional_params.max_retries,
|
||||
create_file_data=_create_file_request,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
api_base = optional_params.api_base or ""
|
||||
|
@ -816,7 +818,7 @@ def file_content(
|
|||
)
|
||||
else:
|
||||
raise litellm.exceptions.BadRequestError(
|
||||
message="LiteLLM doesn't support {} for 'file_content'. Only 'openai' and 'azure' are supported.".format(
|
||||
message="LiteLLM doesn't support {} for 'custom_llm_provider'. Supported providers are 'openai', 'azure', 'vertex_ai'.".format(
|
||||
custom_llm_provider
|
||||
),
|
||||
model="n/a",
|
||||
|
|
|
@ -23,6 +23,9 @@ class AthinaLogger:
|
|||
"context",
|
||||
"expected_response",
|
||||
"user_query",
|
||||
"tags",
|
||||
"user_feedback",
|
||||
"model_options",
|
||||
"custom_attributes",
|
||||
]
|
||||
|
||||
|
@ -81,7 +84,6 @@ class AthinaLogger:
|
|||
for key in self.additional_keys:
|
||||
if key in metadata:
|
||||
data[key] = metadata[key]
|
||||
|
||||
response = litellm.module_level_client.post(
|
||||
self.athina_logging_url,
|
||||
headers=self.headers,
|
||||
|
|
|
@ -239,6 +239,7 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
|
|||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
"responses",
|
||||
],
|
||||
) -> Any:
|
||||
pass
|
||||
|
|
|
@ -73,8 +73,19 @@ def remove_index_from_tool_calls(
def get_litellm_metadata_from_kwargs(kwargs: dict):
    """
    Helper to get litellm metadata from all litellm request kwargs

    Return `litellm_metadata` if it exists, otherwise return `metadata`
    """
    return kwargs.get("litellm_params", {}).get("metadata", {})
    litellm_params = kwargs.get("litellm_params", {})
    if litellm_params:
        metadata = litellm_params.get("metadata", {})
        litellm_metadata = litellm_params.get("litellm_metadata", {})
        if litellm_metadata:
            return litellm_metadata
        elif metadata:
            return metadata

    return {}


# Helper functions used for OTEL logging
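A quick illustration of the precedence implemented above (kwargs are made up; the helper is assumed imported from its module): `litellm_metadata` wins when both keys are present, `metadata` is the fallback, and anything else yields an empty dict.

kwargs = {
    "litellm_params": {
        "metadata": {"user_id": "a"},
        "litellm_metadata": {"user_id": "b"},
    }
}

print(get_litellm_metadata_from_kwargs(kwargs))  # {'user_id': 'b'}, litellm_metadata wins
print(get_litellm_metadata_from_kwargs({"litellm_params": {"metadata": {"user_id": "a"}}}))  # {'user_id': 'a'}
print(get_litellm_metadata_from_kwargs({}))  # {}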
34  litellm/litellm_core_utils/credential_accessor.py  Normal file
@ -0,0 +1,34 @@
"""Utils for accessing credentials."""

from typing import List

import litellm
from litellm.types.utils import CredentialItem


class CredentialAccessor:
    @staticmethod
    def get_credential_values(credential_name: str) -> dict:
        """Safe accessor for credentials."""
        if not litellm.credential_list:
            return {}
        for credential in litellm.credential_list:
            if credential.credential_name == credential_name:
                return credential.credential_values.copy()
        return {}

    @staticmethod
    def upsert_credentials(credentials: List[CredentialItem]):
        """Add a credential to the list of credentials."""

        credential_names = [cred.credential_name for cred in litellm.credential_list]

        for credential in credentials:
            if credential.credential_name in credential_names:
                # Find and replace the existing credential in the list
                for i, existing_cred in enumerate(litellm.credential_list):
                    if existing_cred.credential_name == credential.credential_name:
                        litellm.credential_list[i] = credential
                        break
            else:
                litellm.credential_list.append(credential)
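A small usage sketch for the new accessor (values are placeholders; this assumes `CredentialItem` can be constructed from the two fields the accessor reads, `credential_name` and `credential_values`, and that `litellm.credential_list` is the module-level list it iterates):

import litellm
from litellm.litellm_core_utils.credential_accessor import CredentialAccessor
from litellm.types.utils import CredentialItem

litellm.credential_list = []

CredentialAccessor.upsert_credentials(
    [CredentialItem(credential_name="my-azure", credential_values={"api_key": "sk-..."})]
)
print(CredentialAccessor.get_credential_values("my-azure"))  # {'api_key': 'sk-...'} (a copy)
print(CredentialAccessor.get_credential_values("unknown"))   # {}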
@ -331,6 +331,7 @@ def exception_type( # type: ignore # noqa: PLR0915
|
|||
model=model,
|
||||
response=getattr(original_exception, "response", None),
|
||||
litellm_debug_info=extra_information,
|
||||
body=getattr(original_exception, "body", None),
|
||||
)
|
||||
elif (
|
||||
"Web server is returning an unknown error" in error_str
|
||||
|
@ -421,6 +422,7 @@ def exception_type( # type: ignore # noqa: PLR0915
|
|||
llm_provider=custom_llm_provider,
|
||||
response=getattr(original_exception, "response", None),
|
||||
litellm_debug_info=extra_information,
|
||||
body=getattr(original_exception, "body", None),
|
||||
)
|
||||
elif original_exception.status_code == 429:
|
||||
exception_mapping_worked = True
|
||||
|
@ -1960,6 +1962,7 @@ def exception_type( # type: ignore # noqa: PLR0915
|
|||
model=model,
|
||||
litellm_debug_info=extra_information,
|
||||
response=getattr(original_exception, "response", None),
|
||||
body=getattr(original_exception, "body", None),
|
||||
)
|
||||
elif (
|
||||
"The api_key client option must be set either by passing api_key to the client or by setting"
|
||||
|
@ -1991,6 +1994,7 @@ def exception_type( # type: ignore # noqa: PLR0915
|
|||
model=model,
|
||||
litellm_debug_info=extra_information,
|
||||
response=getattr(original_exception, "response", None),
|
||||
body=getattr(original_exception, "body", None),
|
||||
)
|
||||
elif original_exception.status_code == 401:
|
||||
exception_mapping_worked = True
|
||||
|
|
|
@ -57,6 +57,9 @@ def get_litellm_params(
|
|||
prompt_variables: Optional[dict] = None,
|
||||
async_call: Optional[bool] = None,
|
||||
ssl_verify: Optional[bool] = None,
|
||||
merge_reasoning_content_in_choices: Optional[bool] = None,
|
||||
api_version: Optional[str] = None,
|
||||
max_retries: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
litellm_params = {
|
||||
|
@ -97,5 +100,15 @@ def get_litellm_params(
|
|||
"prompt_variables": prompt_variables,
|
||||
"async_call": async_call,
|
||||
"ssl_verify": ssl_verify,
|
||||
"merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
|
||||
"api_version": api_version,
|
||||
"azure_ad_token": kwargs.get("azure_ad_token"),
|
||||
"tenant_id": kwargs.get("tenant_id"),
|
||||
"client_id": kwargs.get("client_id"),
|
||||
"client_secret": kwargs.get("client_secret"),
|
||||
"azure_username": kwargs.get("azure_username"),
|
||||
"azure_password": kwargs.get("azure_password"),
|
||||
"max_retries": max_retries,
|
||||
"timeout": kwargs.get("timeout"),
|
||||
}
|
||||
return litellm_params
|
||||
|
|
|
@ -25,6 +25,7 @@ from litellm import (
|
|||
turn_off_message_logging,
|
||||
)
|
||||
from litellm._logging import _is_debugging_on, verbose_logger
|
||||
from litellm.batches.batch_utils import _handle_completed_batch
|
||||
from litellm.caching.caching import DualCache, InMemoryCache
|
||||
from litellm.caching.caching_handler import LLMCachingHandler
|
||||
from litellm.cost_calculator import _select_model_name_for_cost_calc
|
||||
|
@ -38,11 +39,14 @@ from litellm.litellm_core_utils.redact_messages import (
|
|||
redact_message_input_output_from_custom_logger,
|
||||
redact_message_input_output_from_logging,
|
||||
)
|
||||
from litellm.responses.utils import ResponseAPILoggingUtils
|
||||
from litellm.types.llms.openai import (
|
||||
AllMessageValues,
|
||||
Batch,
|
||||
FineTuningJob,
|
||||
HttpxBinaryResponseContent,
|
||||
ResponseCompletedEvent,
|
||||
ResponsesAPIResponse,
|
||||
)
|
||||
from litellm.types.rerank import RerankResponse
|
||||
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
|
||||
|
@ -50,9 +54,11 @@ from litellm.types.utils import (
|
|||
CallTypes,
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
LiteLLMBatch,
|
||||
LiteLLMLoggingBaseClass,
|
||||
ModelResponse,
|
||||
ModelResponseStream,
|
||||
RawRequestTypedDict,
|
||||
StandardCallbackDynamicParams,
|
||||
StandardLoggingAdditionalHeaders,
|
||||
StandardLoggingHiddenParams,
|
||||
|
@ -203,6 +209,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
] = None,
|
||||
applied_guardrails: Optional[List[str]] = None,
|
||||
kwargs: Optional[Dict] = None,
|
||||
log_raw_request_response: bool = False,
|
||||
):
|
||||
_input: Optional[str] = messages # save original value of messages
|
||||
if messages is not None:
|
||||
|
@ -231,6 +238,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
self.sync_streaming_chunks: List[Any] = (
|
||||
[]
|
||||
) # for generating complete stream response
|
||||
self.log_raw_request_response = log_raw_request_response
|
||||
|
||||
# Initialize dynamic callbacks
|
||||
self.dynamic_input_callbacks: Optional[
|
||||
|
@ -451,6 +459,18 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
|
||||
return model, messages, non_default_params
|
||||
|
||||
def _get_raw_request_body(self, data: Optional[Union[dict, str]]) -> dict:
|
||||
if data is None:
|
||||
return {"error": "Received empty dictionary for raw request body"}
|
||||
if isinstance(data, str):
|
||||
try:
|
||||
return json.loads(data)
|
||||
except Exception:
|
||||
return {
|
||||
"error": "Unable to parse raw request body. Got - {}".format(data)
|
||||
}
|
||||
return data
|
||||
|
||||
def _pre_call(self, input, api_key, model=None, additional_args={}):
|
||||
"""
|
||||
Common helper function across the sync + async pre-call function
|
||||
|
@ -466,6 +486,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
self.model_call_details["model"] = model
|
||||
|
||||
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
|
||||
|
||||
# Log the exact input to the LLM API
|
||||
litellm.error_logs["PRE_CALL"] = locals()
|
||||
try:
|
||||
|
@ -483,28 +504,54 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
additional_args=additional_args,
|
||||
)
|
||||
# log raw request to provider (like LangFuse) -- if opted in.
|
||||
if log_raw_request_response is True:
|
||||
if (
|
||||
self.log_raw_request_response is True
|
||||
or log_raw_request_response is True
|
||||
):
|
||||
|
||||
_litellm_params = self.model_call_details.get("litellm_params", {})
|
||||
_metadata = _litellm_params.get("metadata", {}) or {}
|
||||
try:
|
||||
# [Non-blocking Extra Debug Information in metadata]
|
||||
if (
|
||||
turn_off_message_logging is not None
|
||||
and turn_off_message_logging is True
|
||||
):
|
||||
if turn_off_message_logging is True:
|
||||
|
||||
_metadata["raw_request"] = (
|
||||
"redacted by litellm. \
|
||||
'litellm.turn_off_message_logging=True'"
|
||||
)
|
||||
else:
|
||||
|
||||
curl_command = self._get_request_curl_command(
|
||||
api_base=additional_args.get("api_base", ""),
|
||||
headers=additional_args.get("headers", {}),
|
||||
additional_args=additional_args,
|
||||
data=additional_args.get("complete_input_dict", {}),
|
||||
)
|
||||
|
||||
_metadata["raw_request"] = str(curl_command)
|
||||
# split up, so it's easier to parse in the UI
|
||||
self.model_call_details["raw_request_typed_dict"] = (
|
||||
RawRequestTypedDict(
|
||||
raw_request_api_base=str(
|
||||
additional_args.get("api_base") or ""
|
||||
),
|
||||
raw_request_body=self._get_raw_request_body(
|
||||
additional_args.get("complete_input_dict", {})
|
||||
),
|
||||
raw_request_headers=self._get_masked_headers(
|
||||
additional_args.get("headers", {}) or {},
|
||||
ignore_sensitive_headers=True,
|
||||
),
|
||||
error=None,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
self.model_call_details["raw_request_typed_dict"] = (
|
||||
RawRequestTypedDict(
|
||||
error=str(e),
|
||||
)
|
||||
)
|
||||
traceback.print_exc()
|
||||
_metadata["raw_request"] = (
|
||||
"Unable to Log \
|
||||
raw request: {}".format(
|
||||
|
@ -637,9 +684,14 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
)
|
||||
verbose_logger.debug(f"\033[92m{curl_command}\033[0m\n")
|
||||
|
||||
def _get_request_body(self, data: dict) -> str:
|
||||
return str(data)
|
||||
|
||||
def _get_request_curl_command(
|
||||
self, api_base: str, headers: dict, additional_args: dict, data: dict
|
||||
self, api_base: str, headers: Optional[dict], additional_args: dict, data: dict
|
||||
) -> str:
|
||||
if headers is None:
|
||||
headers = {}
|
||||
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
|
||||
curl_command += "curl -X POST \\\n"
|
||||
curl_command += f"{api_base} \\\n"
|
||||
|
@ -647,11 +699,10 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
formatted_headers = " ".join(
|
||||
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
|
||||
)
|
||||
|
||||
curl_command += (
|
||||
f"{formatted_headers} \\\n" if formatted_headers.strip() != "" else ""
|
||||
)
|
||||
curl_command += f"-d '{str(data)}'\n"
|
||||
curl_command += f"-d '{self._get_request_body(data)}'\n"
|
||||
if additional_args.get("request_str", None) is not None:
|
||||
# print the sagemaker / bedrock client request
|
||||
curl_command = "\nRequest Sent from LiteLLM:\n"
|
||||
|
@ -660,12 +711,20 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
curl_command = str(self.model_call_details)
|
||||
return curl_command
|
||||
|
||||
def _get_masked_headers(self, headers: dict):
|
||||
def _get_masked_headers(
|
||||
self, headers: dict, ignore_sensitive_headers: bool = False
|
||||
) -> dict:
|
||||
"""
|
||||
Internal debugging helper function
|
||||
|
||||
Masks the headers of the request sent from LiteLLM
|
||||
"""
|
||||
sensitive_keywords = [
|
||||
"authorization",
|
||||
"token",
|
||||
"key",
|
||||
"secret",
|
||||
]
|
||||
return {
|
||||
k: (
|
||||
(v[:-44] + "*" * 44)
|
||||
|
@ -673,6 +732,11 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
else "*****"
|
||||
)
|
||||
for k, v in headers.items()
|
||||
if not ignore_sensitive_headers
|
||||
or not any(
|
||||
sensitive_keyword in k.lower()
|
||||
for sensitive_keyword in sensitive_keywords
|
||||
)
|
||||
}
|
||||
|
||||
def post_call(
|
||||
|
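Roughly what the updated helper does with the new `ignore_sensitive_headers` flag (a sketch based only on the lines shown above; the exact value-masking condition is elided in this hunk, and `logging_obj` stands in for a `Logging` instance): with the flag set, headers whose names contain a sensitive keyword are dropped from the debug output instead of being masked.

headers = {"Authorization": "Bearer sk-123", "Content-Type": "application/json"}

# default behaviour: sensitive values are masked but the header key is kept
print(logging_obj._get_masked_headers(headers))
# e.g. {'Authorization': '*****', 'Content-Type': ...}

# raw-request logging path: sensitive headers are omitted entirely
print(logging_obj._get_masked_headers(headers, ignore_sensitive_headers=True))
# e.g. {'Content-Type': ...}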
@ -790,6 +854,8 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
RerankResponse,
|
||||
Batch,
|
||||
FineTuningJob,
|
||||
ResponsesAPIResponse,
|
||||
ResponseCompletedEvent,
|
||||
],
|
||||
cache_hit: Optional[bool] = None,
|
||||
) -> Optional[float]:
|
||||
|
@ -871,6 +937,24 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
|
||||
return None
|
||||
|
||||
async def _response_cost_calculator_async(
|
||||
self,
|
||||
result: Union[
|
||||
ModelResponse,
|
||||
ModelResponseStream,
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
TranscriptionResponse,
|
||||
TextCompletionResponse,
|
||||
HttpxBinaryResponseContent,
|
||||
RerankResponse,
|
||||
Batch,
|
||||
FineTuningJob,
|
||||
],
|
||||
cache_hit: Optional[bool] = None,
|
||||
) -> Optional[float]:
|
||||
return self._response_cost_calculator(result=result, cache_hit=cache_hit)
|
||||
|
||||
def should_run_callback(
|
||||
self, callback: litellm.CALLBACK_TYPES, litellm_params: dict, event_hook: str
|
||||
) -> bool:
|
||||
|
@ -912,13 +996,16 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
self.model_call_details["log_event_type"] = "successful_api_call"
|
||||
self.model_call_details["end_time"] = end_time
|
||||
self.model_call_details["cache_hit"] = cache_hit
|
||||
|
||||
if self.call_type == CallTypes.anthropic_messages.value:
|
||||
result = self._handle_anthropic_messages_response_logging(result=result)
|
||||
## if model in model cost map - log the response cost
|
||||
## else set cost to None
|
||||
if (
|
||||
standard_logging_object is None
|
||||
and result is not None
|
||||
and self.stream is not True
|
||||
): # handle streaming separately
|
||||
):
|
||||
if (
|
||||
isinstance(result, ModelResponse)
|
||||
or isinstance(result, ModelResponseStream)
|
||||
|
@ -928,8 +1015,9 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
or isinstance(result, TextCompletionResponse)
|
||||
or isinstance(result, HttpxBinaryResponseContent) # tts
|
||||
or isinstance(result, RerankResponse)
|
||||
or isinstance(result, Batch)
|
||||
or isinstance(result, FineTuningJob)
|
||||
or isinstance(result, LiteLLMBatch)
|
||||
or isinstance(result, ResponsesAPIResponse)
|
||||
):
|
||||
## HIDDEN PARAMS ##
|
||||
hidden_params = getattr(result, "_hidden_params", {})
|
||||
|
@ -1029,7 +1117,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
|
||||
## BUILD COMPLETE STREAMED RESPONSE
|
||||
complete_streaming_response: Optional[
|
||||
Union[ModelResponse, TextCompletionResponse]
|
||||
Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]
|
||||
] = None
|
||||
if "complete_streaming_response" in self.model_call_details:
|
||||
return # break out of this.
|
||||
|
@ -1525,6 +1613,20 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
print_verbose(
|
||||
"Logging Details LiteLLM-Async Success Call, cache_hit={}".format(cache_hit)
|
||||
)
|
||||
|
||||
## CALCULATE COST FOR BATCH JOBS
|
||||
if self.call_type == CallTypes.aretrieve_batch.value and isinstance(
|
||||
result, LiteLLMBatch
|
||||
):
|
||||
|
||||
response_cost, batch_usage, batch_models = await _handle_completed_batch(
|
||||
batch=result, custom_llm_provider=self.custom_llm_provider
|
||||
)
|
||||
|
||||
result._hidden_params["response_cost"] = response_cost
|
||||
result._hidden_params["batch_models"] = batch_models
|
||||
result.usage = batch_usage
|
||||
|
||||
start_time, end_time, result = self._success_handler_helper_fn(
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
|
@ -1532,11 +1634,12 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
cache_hit=cache_hit,
|
||||
standard_logging_object=kwargs.get("standard_logging_object", None),
|
||||
)
|
||||
|
||||
## BUILD COMPLETE STREAMED RESPONSE
|
||||
if "async_complete_streaming_response" in self.model_call_details:
|
||||
return # break out of this.
|
||||
complete_streaming_response: Optional[
|
||||
Union[ModelResponse, TextCompletionResponse]
|
||||
Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]
|
||||
] = self._get_assembled_streaming_response(
|
||||
result=result,
|
||||
start_time=start_time,
|
||||
|
@ -2246,16 +2349,24 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
|
||||
def _get_assembled_streaming_response(
|
||||
self,
|
||||
result: Union[ModelResponse, TextCompletionResponse, ModelResponseStream, Any],
|
||||
result: Union[
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
ModelResponseStream,
|
||||
ResponseCompletedEvent,
|
||||
Any,
|
||||
],
|
||||
start_time: datetime.datetime,
|
||||
end_time: datetime.datetime,
|
||||
is_async: bool,
|
||||
streaming_chunks: List[Any],
|
||||
) -> Optional[Union[ModelResponse, TextCompletionResponse]]:
|
||||
) -> Optional[Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]]:
|
||||
if isinstance(result, ModelResponse):
|
||||
return result
|
||||
elif isinstance(result, TextCompletionResponse):
|
||||
return result
|
||||
elif isinstance(result, ResponseCompletedEvent):
|
||||
return result.response
|
||||
elif isinstance(result, ModelResponseStream):
|
||||
complete_streaming_response: Optional[
|
||||
Union[ModelResponse, TextCompletionResponse]
|
||||
|
@ -2270,6 +2381,37 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
return complete_streaming_response
|
||||
return None
|
||||
|
||||
def _handle_anthropic_messages_response_logging(self, result: Any) -> ModelResponse:
|
||||
"""
|
||||
Handles logging for Anthropic messages responses.
|
||||
|
||||
Args:
|
||||
result: The response object from the model call
|
||||
|
||||
Returns:
|
||||
The response object from the model call
|
||||
|
||||
- For Non-streaming responses, we need to transform the response to a ModelResponse object.
|
||||
- For streaming responses, the anthropic_messages handler calls success_handler with an assembled ModelResponse.
|
||||
"""
|
||||
if self.stream and isinstance(result, ModelResponse):
|
||||
return result
|
||||
|
||||
result = litellm.AnthropicConfig().transform_response(
|
||||
raw_response=self.model_call_details["httpx_response"],
|
||||
model_response=litellm.ModelResponse(),
|
||||
model=self.model,
|
||||
messages=[],
|
||||
logging_obj=self,
|
||||
optional_params={},
|
||||
api_key="",
|
||||
request_data={},
|
||||
encoding=litellm.encoding,
|
||||
json_mode=False,
|
||||
litellm_params={},
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def set_callbacks(callback_list, function_id=None): # noqa: PLR0915
|
||||
"""
|
||||
|
@ -2983,6 +3125,12 @@ class StandardLoggingPayloadSetup:
|
|||
elif isinstance(usage, Usage):
|
||||
return usage
|
||||
elif isinstance(usage, dict):
|
||||
if ResponseAPILoggingUtils._is_response_api_usage(usage):
|
||||
return (
|
||||
ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
|
||||
usage
|
||||
)
|
||||
)
|
||||
return Usage(**usage)
|
||||
|
||||
raise ValueError(f"usage is required, got={usage} of type {type(usage)}")
|
||||
|
@ -3086,6 +3234,7 @@ class StandardLoggingPayloadSetup:
|
|||
response_cost=None,
|
||||
additional_headers=None,
|
||||
litellm_overhead_time_ms=None,
|
||||
batch_models=None,
|
||||
)
|
||||
if hidden_params is not None:
|
||||
for key in StandardLoggingHiddenParams.__annotations__.keys():
|
||||
|
@ -3199,6 +3348,7 @@ def get_standard_logging_object_payload(
|
|||
api_base=None,
|
||||
response_cost=None,
|
||||
litellm_overhead_time_ms=None,
|
||||
batch_models=None,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -3483,6 +3633,7 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
|
|||
response_cost=None,
|
||||
additional_headers=None,
|
||||
litellm_overhead_time_ms=None,
|
||||
batch_models=None,
|
||||
)
|
||||
|
||||
# Convert numeric values to appropriate types
|
||||
|
|
|
@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
|
|||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
|
||||
from litellm.types.llms.openai import ChatCompletionThinkingBlock
|
||||
from litellm.types.utils import (
|
||||
ChatCompletionDeltaToolCall,
|
||||
ChatCompletionMessageToolCall,
|
||||
|
@ -128,12 +129,7 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
|
|||
model_response_object = ModelResponse(stream=True)
|
||||
choice_list = []
|
||||
for idx, choice in enumerate(response_object["choices"]):
|
||||
delta = Delta(
|
||||
content=choice["message"].get("content", None),
|
||||
role=choice["message"]["role"],
|
||||
function_call=choice["message"].get("function_call", None),
|
||||
tool_calls=choice["message"].get("tool_calls", None),
|
||||
)
|
||||
delta = Delta(**choice["message"])
|
||||
finish_reason = choice.get("finish_reason", None)
|
||||
if finish_reason is None:
|
||||
# gpt-4 vision can return 'finish_reason' or 'finish_details'
|
||||
|
@ -243,6 +239,24 @@ def _parse_content_for_reasoning(
    return None, message_text


def _extract_reasoning_content(message: dict) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract reasoning content and main content from a message.

    Args:
        message (dict): The message dictionary that may contain reasoning_content

    Returns:
        tuple[Optional[str], Optional[str]]: A tuple of (reasoning_content, content)
    """
    if "reasoning_content" in message:
        return message["reasoning_content"], message["content"]
    elif "reasoning" in message:
        return message["reasoning"], message["content"]
    else:
        return _parse_content_for_reasoning(message.get("content"))


class LiteLLMResponseObjectHandler:

    @staticmethod
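The lookup order added above, shown on hand-written message dicts (the helper is module-private; values are made up): an explicit `reasoning_content` key wins, then `reasoning`, and only then is the content parsed for inline reasoning tags.

msg = {"role": "assistant", "content": "42", "reasoning_content": "I multiplied 6 by 7."}
print(_extract_reasoning_content(msg))   # ('I multiplied 6 by 7.', '42')

msg = {"role": "assistant", "content": "42", "reasoning": "6 * 7"}
print(_extract_reasoning_content(msg))   # ('6 * 7', '42')

msg = {"role": "assistant", "content": "plain answer"}
print(_extract_reasoning_content(msg))   # falls back to _parse_content_for_reasoning(...)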
@ -456,11 +470,16 @@ def convert_to_model_response_object( # noqa: PLR0915
|
|||
provider_specific_fields[field] = choice["message"][field]
|
||||
|
||||
# Handle reasoning models that display `reasoning_content` within `content`
|
||||
|
||||
reasoning_content, content = _parse_content_for_reasoning(
|
||||
choice["message"].get("content")
|
||||
reasoning_content, content = _extract_reasoning_content(
|
||||
choice["message"]
|
||||
)
|
||||
|
||||
# Handle thinking models that display `thinking_blocks` within `content`
|
||||
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
|
||||
if "thinking_blocks" in choice["message"]:
|
||||
thinking_blocks = choice["message"]["thinking_blocks"]
|
||||
provider_specific_fields["thinking_blocks"] = thinking_blocks
|
||||
|
||||
if reasoning_content:
|
||||
provider_specific_fields["reasoning_content"] = (
|
||||
reasoning_content
|
||||
|
@ -474,6 +493,7 @@ def convert_to_model_response_object( # noqa: PLR0915
|
|||
audio=choice["message"].get("audio", None),
|
||||
provider_specific_fields=provider_specific_fields,
|
||||
reasoning_content=reasoning_content,
|
||||
thinking_blocks=thinking_blocks,
|
||||
)
|
||||
finish_reason = choice.get("finish_reason", None)
|
||||
if finish_reason is None:
|
||||
|
|
|
@ -187,53 +187,125 @@ def ollama_pt(
|
|||
final_prompt_value="### Response:",
|
||||
messages=messages,
|
||||
)
|
||||
elif "llava" in model:
|
||||
prompt = ""
|
||||
images = []
|
||||
for message in messages:
|
||||
if isinstance(message["content"], str):
|
||||
prompt += message["content"]
|
||||
elif isinstance(message["content"], list):
|
||||
# see https://docs.litellm.ai/docs/providers/openai#openai-vision-models
|
||||
for element in message["content"]:
|
||||
if isinstance(element, dict):
|
||||
if element["type"] == "text":
|
||||
prompt += element["text"]
|
||||
elif element["type"] == "image_url":
|
||||
base64_image = convert_to_ollama_image(
|
||||
element["image_url"]["url"]
|
||||
)
|
||||
images.append(base64_image)
|
||||
return {"prompt": prompt, "images": images}
|
||||
else:
|
||||
user_message_types = {"user", "tool", "function"}
|
||||
msg_i = 0
|
||||
images = []
|
||||
prompt = ""
|
||||
for message in messages:
|
||||
role = message["role"]
|
||||
content = message.get("content", "")
|
||||
while msg_i < len(messages):
|
||||
init_msg_i = msg_i
|
||||
user_content_str = ""
|
||||
## MERGE CONSECUTIVE USER CONTENT ##
|
||||
while (
|
||||
msg_i < len(messages) and messages[msg_i]["role"] in user_message_types
|
||||
):
|
||||
msg_content = messages[msg_i].get("content")
|
||||
if msg_content:
|
||||
if isinstance(msg_content, list):
|
||||
for m in msg_content:
|
||||
if m.get("type", "") == "image_url":
|
||||
if isinstance(m["image_url"], str):
|
||||
images.append(m["image_url"])
|
||||
elif isinstance(m["image_url"], dict):
|
||||
images.append(m["image_url"]["url"])
|
||||
elif m.get("type", "") == "text":
|
||||
user_content_str += m["text"]
|
||||
else:
|
||||
# Tool message content will always be a string
|
||||
user_content_str += msg_content
|
||||
|
||||
if "tool_calls" in message:
|
||||
tool_calls = []
|
||||
msg_i += 1
|
||||
|
||||
for call in message["tool_calls"]:
|
||||
call_id: str = call["id"]
|
||||
function_name: str = call["function"]["name"]
|
||||
arguments = json.loads(call["function"]["arguments"])
|
||||
if user_content_str:
|
||||
prompt += f"### User:\n{user_content_str}\n\n"
|
||||
|
||||
tool_calls.append(
|
||||
{
|
||||
"id": call_id,
|
||||
"type": "function",
|
||||
"function": {"name": function_name, "arguments": arguments},
|
||||
}
|
||||
assistant_content_str = ""
|
||||
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
|
||||
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
|
||||
msg_content = messages[msg_i].get("content")
|
||||
if msg_content:
|
||||
if isinstance(msg_content, list):
|
||||
for m in msg_content:
|
||||
if m.get("type", "") == "text":
|
||||
assistant_content_str += m["text"]
|
||||
elif isinstance(msg_content, str):
|
||||
# Tool message content will always be a string
|
||||
assistant_content_str += msg_content
|
||||
|
||||
tool_calls = messages[msg_i].get("tool_calls")
|
||||
ollama_tool_calls = []
|
||||
if tool_calls:
|
||||
for call in tool_calls:
|
||||
call_id: str = call["id"]
|
||||
function_name: str = call["function"]["name"]
|
||||
arguments = json.loads(call["function"]["arguments"])
|
||||
|
||||
ollama_tool_calls.append(
|
||||
{
|
||||
"id": call_id,
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": function_name,
|
||||
"arguments": arguments,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
if ollama_tool_calls:
|
||||
assistant_content_str += (
|
||||
f"Tool Calls: {json.dumps(ollama_tool_calls, indent=2)}"
|
||||
)
|
||||
|
||||
prompt += f"### Assistant:\nTool Calls: {json.dumps(tool_calls, indent=2)}\n\n"
|
||||
msg_i += 1
|
||||
|
||||
elif "tool_call_id" in message:
|
||||
prompt += f"### User:\n{message['content']}\n\n"
|
||||
if assistant_content_str:
|
||||
prompt += f"### Assistant:\n{assistant_content_str}\n\n"
|
||||
|
||||
elif content:
|
||||
prompt += f"### {role.capitalize()}:\n{content}\n\n"
|
||||
if msg_i == init_msg_i: # prevent infinite loops
|
||||
raise litellm.BadRequestError(
|
||||
message=BAD_MESSAGE_ERROR_STR + f"passed in {messages[msg_i]}",
|
||||
model=model,
|
||||
llm_provider="ollama",
|
||||
)
|
||||
# prompt = ""
|
||||
# images = []
|
||||
# for message in messages:
|
||||
# if isinstance(message["content"], str):
|
||||
# prompt += message["content"]
|
||||
# elif isinstance(message["content"], list):
|
||||
# # see https://docs.litellm.ai/docs/providers/openai#openai-vision-models
|
||||
# for element in message["content"]:
|
||||
# if isinstance(element, dict):
|
||||
# if element["type"] == "text":
|
||||
# prompt += element["text"]
|
||||
# elif element["type"] == "image_url":
|
||||
# base64_image = convert_to_ollama_image(
|
||||
# element["image_url"]["url"]
|
||||
# )
|
||||
# images.append(base64_image)
|
||||
|
||||
# if "tool_calls" in message:
|
||||
# tool_calls = []
|
||||
|
||||
# for call in message["tool_calls"]:
|
||||
# call_id: str = call["id"]
|
||||
# function_name: str = call["function"]["name"]
|
||||
# arguments = json.loads(call["function"]["arguments"])
|
||||
|
||||
# tool_calls.append(
|
||||
# {
|
||||
# "id": call_id,
|
||||
# "type": "function",
|
||||
# "function": {"name": function_name, "arguments": arguments},
|
||||
# }
|
||||
# )
|
||||
|
||||
# prompt += f"### Assistant:\nTool Calls: {json.dumps(tool_calls, indent=2)}\n\n"
|
||||
|
||||
# elif "tool_call_id" in message:
|
||||
# prompt += f"### User:\n{message['content']}\n\n"
|
||||
|
||||
return {"prompt": prompt, "images": images}
|
||||
|
||||
return prompt
|
||||
|
||||
|
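Roughly what the rewritten fallback branch now produces for a short conversation (an illustrative sketch, not output captured from the diff; exact whitespace comes from the f-strings above): consecutive user/tool messages are merged into one "### User:" section, consecutive assistant messages into one "### Assistant:" section, and tool calls are serialized as JSON inside the assistant section.

messages = [
    {"role": "user", "content": "What's the weather in Paris?"},
    {"role": "assistant", "content": "Let me check.",
     "tool_calls": [{"id": "call_1", "type": "function",
                     "function": {"name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}]},
]

# ollama_pt(model="mistral", messages=messages) would yield a prompt shaped like:
#
# ### User:
# What's the weather in Paris?
#
# ### Assistant:
# Let me check.Tool Calls: [ {"id": "call_1", ..., "arguments": {"city": "Paris"}} ]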
@ -680,12 +752,13 @@ def convert_generic_image_chunk_to_openai_image_obj(
|
|||
Return:
|
||||
"data:image/jpeg;base64,{base64_image}"
|
||||
"""
|
||||
return "data:{};{},{}".format(
|
||||
image_chunk["media_type"], image_chunk["type"], image_chunk["data"]
|
||||
)
|
||||
media_type = image_chunk["media_type"]
|
||||
return "data:{};{},{}".format(media_type, image_chunk["type"], image_chunk["data"])
|
||||
|
||||
|
||||
def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
|
||||
def convert_to_anthropic_image_obj(
|
||||
openai_image_url: str, format: Optional[str]
|
||||
) -> GenericImageParsingChunk:
|
||||
"""
|
||||
Input:
|
||||
"image_url": "data:image/jpeg;base64,{base64_image}",
|
||||
|
@ -702,7 +775,11 @@ def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsing
|
|||
openai_image_url = convert_url_to_base64(url=openai_image_url)
|
||||
# Extract the media type and base64 data
|
||||
media_type, base64_data = openai_image_url.split("data:")[1].split(";base64,")
|
||||
media_type = media_type.replace("\\/", "/")
|
||||
|
||||
if format:
|
||||
media_type = format
|
||||
else:
|
||||
media_type = media_type.replace("\\/", "/")
|
||||
|
||||
return GenericImageParsingChunk(
|
||||
type="base64",
|
||||
|
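How the new `format` parameter changes image handling (a sketch; the base64 payload is truncated and the printed dicts are approximate): when the OpenAI-style `image_url` dict carries a `format` field, that value overrides the media type parsed from the data URL.

# without an explicit format, the media type comes from the data URL itself
chunk = convert_to_anthropic_image_obj("data:image/jpeg;base64,/9j/4AAQ...", format=None)
# -> roughly {"type": "base64", "media_type": "image/jpeg", "data": "/9j/4AAQ..."}

# a user-supplied format wins over the parsed one
chunk = convert_to_anthropic_image_obj("data:image/jpeg;base64,/9j/4AAQ...", format="image/webp")
# -> media_type becomes "image/webp"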
@ -820,11 +897,12 @@ def anthropic_messages_pt_xml(messages: list):
|
|||
if isinstance(messages[msg_i]["content"], list):
|
||||
for m in messages[msg_i]["content"]:
|
||||
if m.get("type", "") == "image_url":
|
||||
format = m["image_url"].get("format")
|
||||
user_content.append(
|
||||
{
|
||||
"type": "image",
|
||||
"source": convert_to_anthropic_image_obj(
|
||||
m["image_url"]["url"]
|
||||
m["image_url"]["url"], format=format
|
||||
),
|
||||
}
|
||||
)
|
||||
|
@ -1156,10 +1234,13 @@ def convert_to_anthropic_tool_result(
|
|||
)
|
||||
elif content["type"] == "image_url":
|
||||
if isinstance(content["image_url"], str):
|
||||
image_chunk = convert_to_anthropic_image_obj(content["image_url"])
|
||||
else:
|
||||
image_chunk = convert_to_anthropic_image_obj(
|
||||
content["image_url"]["url"]
|
||||
content["image_url"], format=None
|
||||
)
|
||||
else:
|
||||
format = content["image_url"].get("format")
|
||||
image_chunk = convert_to_anthropic_image_obj(
|
||||
content["image_url"]["url"], format=format
|
||||
)
|
||||
anthropic_content_list.append(
|
||||
AnthropicMessagesImageParam(
|
||||
|
@ -1282,6 +1363,7 @@ def add_cache_control_to_content(
|
|||
AnthropicMessagesImageParam,
|
||||
AnthropicMessagesTextParam,
|
||||
AnthropicMessagesDocumentParam,
|
||||
ChatCompletionThinkingBlock,
|
||||
],
|
||||
orignal_content_element: Union[dict, AllMessageValues],
|
||||
):
|
||||
|
@ -1317,6 +1399,7 @@ def _anthropic_content_element_factory(
|
|||
data=image_chunk["data"],
|
||||
),
|
||||
)
|
||||
|
||||
return _anthropic_content_element
|
||||
|
||||
|
||||
|
@ -1368,13 +1451,16 @@ def anthropic_messages_pt( # noqa: PLR0915
|
|||
for m in user_message_types_block["content"]:
|
||||
if m.get("type", "") == "image_url":
|
||||
m = cast(ChatCompletionImageObject, m)
|
||||
format: Optional[str] = None
|
||||
if isinstance(m["image_url"], str):
|
||||
image_chunk = convert_to_anthropic_image_obj(
|
||||
openai_image_url=m["image_url"]
|
||||
openai_image_url=m["image_url"], format=None
|
||||
)
|
||||
else:
|
||||
format = m["image_url"].get("format")
|
||||
image_chunk = convert_to_anthropic_image_obj(
|
||||
openai_image_url=m["image_url"]["url"]
|
||||
openai_image_url=m["image_url"]["url"],
|
||||
format=format,
|
||||
)
|
||||
|
||||
_anthropic_content_element = (
|
||||
|
@ -1454,12 +1540,23 @@ def anthropic_messages_pt( # noqa: PLR0915
|
|||
assistant_content_block["content"], list
|
||||
):
|
||||
for m in assistant_content_block["content"]:
|
||||
# handle text
|
||||
# handle thinking blocks
|
||||
thinking_block = cast(str, m.get("thinking", ""))
|
||||
text_block = cast(str, m.get("text", ""))
|
||||
if (
|
||||
m.get("type", "") == "text" and len(m.get("text", "")) > 0
|
||||
m.get("type", "") == "thinking" and len(thinking_block) > 0
|
||||
): # don't pass empty text blocks. anthropic api raises errors.
|
||||
anthropic_message: Union[
|
||||
ChatCompletionThinkingBlock,
|
||||
AnthropicMessagesTextParam,
|
||||
] = cast(ChatCompletionThinkingBlock, m)
|
||||
assistant_content.append(anthropic_message)
|
||||
# handle text
|
||||
elif (
|
||||
m.get("type", "") == "text" and len(text_block) > 0
|
||||
): # don't pass empty text blocks. anthropic api raises errors.
|
||||
anthropic_message = AnthropicMessagesTextParam(
|
||||
type="text", text=m.get("text")
|
||||
type="text", text=text_block
|
||||
)
|
||||
_cached_message = add_cache_control_to_content(
|
||||
anthropic_content_element=anthropic_message,
|
||||
|
@ -1512,6 +1609,7 @@ def anthropic_messages_pt( # noqa: PLR0915
|
|||
msg_i += 1
|
||||
|
||||
if assistant_content:
|
||||
|
||||
new_messages.append({"role": "assistant", "content": assistant_content})
|
||||
|
||||
if msg_i == init_msg_i: # prevent infinite loops
|
||||
|
@ -1520,17 +1618,6 @@ def anthropic_messages_pt( # noqa: PLR0915
|
|||
model=model,
|
||||
llm_provider=llm_provider,
|
||||
)
|
||||
if not new_messages or new_messages[0]["role"] != "user":
|
||||
if litellm.modify_params:
|
||||
new_messages.insert(
|
||||
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
|
||||
)
|
||||
else:
|
||||
raise Exception(
|
||||
"Invalid first message={}. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, ".format(
|
||||
new_messages
|
||||
)
|
||||
)
|
||||
|
||||
if new_messages[-1]["role"] == "assistant":
|
||||
if isinstance(new_messages[-1]["content"], str):
|
||||
|
@ -2301,8 +2388,11 @@ class BedrockImageProcessor:
|
|||
)
|
||||
|
||||
@classmethod
|
||||
def process_image_sync(cls, image_url: str) -> BedrockContentBlock:
|
||||
def process_image_sync(
|
||||
cls, image_url: str, format: Optional[str] = None
|
||||
) -> BedrockContentBlock:
|
||||
"""Synchronous image processing."""
|
||||
|
||||
if "base64" in image_url:
|
||||
img_bytes, mime_type, image_format = cls._parse_base64_image(image_url)
|
||||
elif "http://" in image_url or "https://" in image_url:
|
||||
|
@ -2313,11 +2403,17 @@ class BedrockImageProcessor:
|
|||
"Unsupported image type. Expected either image url or base64 encoded string"
|
||||
)
|
||||
|
||||
if format:
|
||||
mime_type = format
|
||||
image_format = mime_type.split("/")[1]
|
||||
|
||||
image_format = cls._validate_format(mime_type, image_format)
|
||||
return cls._create_bedrock_block(img_bytes, mime_type, image_format)
|
||||
|
||||
@classmethod
|
||||
async def process_image_async(cls, image_url: str) -> BedrockContentBlock:
|
||||
async def process_image_async(
|
||||
cls, image_url: str, format: Optional[str]
|
||||
) -> BedrockContentBlock:
|
||||
"""Asynchronous image processing."""
|
||||
|
||||
if "base64" in image_url:
|
||||
|
@ -2332,6 +2428,10 @@ class BedrockImageProcessor:
|
|||
"Unsupported image type. Expected either image url or base64 encoded string"
|
||||
)
|
||||
|
||||
if format: # override with user-defined params
|
||||
mime_type = format
|
||||
image_format = mime_type.split("/")[1]
|
||||
|
||||
image_format = cls._validate_format(mime_type, image_format)
|
||||
return cls._create_bedrock_block(img_bytes, mime_type, image_format)
|
||||
|
||||
|
@ -2819,12 +2919,14 @@ class BedrockConverseMessagesProcessor:
|
|||
_part = BedrockContentBlock(text=element["text"])
|
||||
_parts.append(_part)
|
||||
elif element["type"] == "image_url":
|
||||
format: Optional[str] = None
|
||||
if isinstance(element["image_url"], dict):
|
||||
image_url = element["image_url"]["url"]
|
||||
format = element["image_url"].get("format")
|
||||
else:
|
||||
image_url = element["image_url"]
|
||||
_part = await BedrockImageProcessor.process_image_async( # type: ignore
|
||||
image_url=image_url
|
||||
image_url=image_url, format=format
|
||||
)
|
||||
_parts.append(_part) # type: ignore
|
||||
_cache_point_block = (
|
||||
|
@ -2924,7 +3026,14 @@ class BedrockConverseMessagesProcessor:
|
|||
assistants_parts: List[BedrockContentBlock] = []
|
||||
for element in _assistant_content:
|
||||
if isinstance(element, dict):
|
||||
if element["type"] == "text":
|
||||
if element["type"] == "thinking":
|
||||
thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
|
||||
thinking_blocks=[
|
||||
cast(ChatCompletionThinkingBlock, element)
|
||||
]
|
||||
)
|
||||
assistants_parts.extend(thinking_block)
|
||||
elif element["type"] == "text":
|
||||
assistants_part = BedrockContentBlock(
|
||||
text=element["text"]
|
||||
)
|
||||
|
@ -2974,7 +3083,7 @@ class BedrockConverseMessagesProcessor:
|
|||
reasoning_content_blocks: List[BedrockContentBlock] = []
|
||||
for thinking_block in thinking_blocks:
|
||||
reasoning_text = thinking_block.get("thinking")
|
||||
reasoning_signature = thinking_block.get("signature_delta")
|
||||
reasoning_signature = thinking_block.get("signature")
|
||||
text_block = BedrockConverseReasoningTextBlock(
|
||||
text=reasoning_text or "",
|
||||
)
|
||||
|
@ -3050,12 +3159,15 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
|
|||
_part = BedrockContentBlock(text=element["text"])
|
||||
_parts.append(_part)
|
||||
elif element["type"] == "image_url":
|
||||
format: Optional[str] = None
|
||||
if isinstance(element["image_url"], dict):
|
||||
image_url = element["image_url"]["url"]
|
||||
format = element["image_url"].get("format")
|
||||
else:
|
||||
image_url = element["image_url"]
|
||||
_part = BedrockImageProcessor.process_image_sync( # type: ignore
|
||||
image_url=image_url
|
||||
image_url=image_url,
|
||||
format=format,
|
||||
)
|
||||
_parts.append(_part) # type: ignore
|
||||
_cache_point_block = (
|
||||
|
@ -3157,7 +3269,14 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
|
|||
assistants_parts: List[BedrockContentBlock] = []
|
||||
for element in _assistant_content:
|
||||
if isinstance(element, dict):
|
||||
if element["type"] == "text":
|
||||
if element["type"] == "thinking":
|
||||
thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
|
||||
thinking_blocks=[
|
||||
cast(ChatCompletionThinkingBlock, element)
|
||||
]
|
||||
)
|
||||
assistants_parts.extend(thinking_block)
|
||||
elif element["type"] == "text":
|
||||
assistants_part = BedrockContentBlock(text=element["text"])
|
||||
assistants_parts.append(assistants_part)
|
||||
elif element["type"] == "image_url":
|
||||
|
|
|
@ -15,6 +15,7 @@ from litellm import verbose_logger
|
|||
from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
|
||||
from litellm.litellm_core_utils.thread_pool_executor import executor
|
||||
from litellm.types.llms.openai import ChatCompletionChunk
|
||||
from litellm.types.router import GenericLiteLLMParams
|
||||
from litellm.types.utils import Delta
|
||||
from litellm.types.utils import GenericStreamingChunk as GChunk
|
||||
from litellm.types.utils import (
|
||||
|
@ -70,6 +71,17 @@ class CustomStreamWrapper:
|
|||
self.completion_stream = completion_stream
|
||||
self.sent_first_chunk = False
|
||||
self.sent_last_chunk = False
|
||||
|
||||
litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
|
||||
**self.logging_obj.model_call_details.get("litellm_params", {})
|
||||
)
|
||||
self.merge_reasoning_content_in_choices: bool = (
|
||||
litellm_params.merge_reasoning_content_in_choices or False
|
||||
)
|
||||
self.sent_first_thinking_block = False
|
||||
self.sent_last_thinking_block = False
|
||||
self.thinking_content = ""
|
||||
|
||||
self.system_fingerprint: Optional[str] = None
|
||||
self.received_finish_reason: Optional[str] = None
|
||||
self.intermittent_finish_reason: Optional[str] = (
|
||||
|
@ -87,12 +99,7 @@ class CustomStreamWrapper:
|
|||
self.holding_chunk = ""
|
||||
self.complete_response = ""
|
||||
self.response_uptil_now = ""
|
||||
_model_info = (
|
||||
self.logging_obj.model_call_details.get("litellm_params", {}).get(
|
||||
"model_info", {}
|
||||
)
|
||||
or {}
|
||||
)
|
||||
_model_info: Dict = litellm_params.model_info or {}
|
||||
|
||||
_api_base = get_api_base(
|
||||
model=model or "",
|
||||
|
@ -630,7 +637,10 @@ class CustomStreamWrapper:
|
|||
if isinstance(chunk, bytes):
|
||||
chunk = chunk.decode("utf-8")
|
||||
if "text_output" in chunk:
|
||||
response = chunk.replace("data: ", "").strip()
|
||||
response = (
|
||||
CustomStreamWrapper._strip_sse_data_from_chunk(chunk) or ""
|
||||
)
|
||||
response = response.strip()
|
||||
parsed_response = json.loads(response)
|
||||
else:
|
||||
return {
|
||||
|
@ -873,6 +883,10 @@ class CustomStreamWrapper:
|
|||
_index: Optional[int] = completion_obj.get("index")
|
||||
if _index is not None:
|
||||
model_response.choices[0].index = _index
|
||||
|
||||
self._optional_combine_thinking_block_in_choices(
|
||||
model_response=model_response
|
||||
)
|
||||
print_verbose(f"returning model_response: {model_response}")
|
||||
return model_response
|
||||
else:
|
||||
|
@ -929,6 +943,48 @@ class CustomStreamWrapper:
|
|||
self.chunks.append(model_response)
|
||||
return
|
||||
|
||||
def _optional_combine_thinking_block_in_choices(
|
||||
self, model_response: ModelResponseStream
|
||||
) -> None:
|
||||
"""
|
||||
UIs like OpenWebUI expect to get a single chunk with <think>...</think> tags in the chunk content
|
||||
|
||||
In place updates the model_response object with reasoning_content in content with <think>...</think> tags
|
||||
|
||||
Enabled when `merge_reasoning_content_in_choices=True` passed in request params
|
||||
|
||||
|
||||
"""
|
||||
if self.merge_reasoning_content_in_choices is True:
|
||||
reasoning_content = getattr(
|
||||
model_response.choices[0].delta, "reasoning_content", None
|
||||
)
|
||||
if reasoning_content:
|
||||
if self.sent_first_thinking_block is False:
|
||||
model_response.choices[0].delta.content += (
|
||||
"<think>" + reasoning_content
|
||||
)
|
||||
self.sent_first_thinking_block = True
|
||||
elif (
|
||||
self.sent_first_thinking_block is True
|
||||
and hasattr(model_response.choices[0].delta, "reasoning_content")
|
||||
and model_response.choices[0].delta.reasoning_content
|
||||
):
|
||||
model_response.choices[0].delta.content = reasoning_content
|
||||
elif (
|
||||
self.sent_first_thinking_block is True
|
||||
and not self.sent_last_thinking_block
|
||||
and model_response.choices[0].delta.content
|
||||
):
|
||||
model_response.choices[0].delta.content = (
|
||||
"</think>" + model_response.choices[0].delta.content
|
||||
)
|
||||
self.sent_last_thinking_block = True
|
||||
|
||||
if hasattr(model_response.choices[0].delta, "reasoning_content"):
|
||||
del model_response.choices[0].delta.reasoning_content
|
||||
return
|
||||
|
||||
def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915
|
||||
model_response = self.model_response_creator()
|
||||
response_obj: Dict[str, Any] = {}
|
||||
|
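How this is meant to be used from the caller's side (a hedged sketch; the model name is illustrative and assumes a provider that emits `reasoning_content`): passing `merge_reasoning_content_in_choices=True` on a streaming request makes the wrapper emit the reasoning wrapped in <think>...</think> inside `delta.content`, instead of as a separate `reasoning_content` field.

import litellm

response = litellm.completion(
    model="deepseek/deepseek-reasoner",  # any model that streams reasoning_content
    messages=[{"role": "user", "content": "How many r's are in 'strawberry'?"}],
    stream=True,
    merge_reasoning_content_in_choices=True,  # new litellm param picked up via litellm_params above
)

for chunk in response:
    # reasoning arrives once inside <think>...</think>, followed by the normal answer text
    print(chunk.choices[0].delta.content or "", end="")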
@ -1775,6 +1831,42 @@
            extra_kwargs={},
        )

    @staticmethod
    def _strip_sse_data_from_chunk(chunk: Optional[str]) -> Optional[str]:
        """
        Strips the 'data: ' prefix from Server-Sent Events (SSE) chunks.

        Some providers, like SageMaker, send it as `data:` (no space), so both variants need to be handled.

        SSE messages are prefixed with 'data: ' which is part of the protocol,
        not the actual content from the LLM. This method removes that prefix
        and returns the actual content.

        Args:
            chunk: The SSE chunk that may contain the 'data: ' prefix (string or bytes)

        Returns:
            The chunk with the 'data: ' prefix removed, or the original chunk
            if no prefix was found. Returns None if input is None.

        See OpenAI Python Ref for this: https://github.com/openai/openai-python/blob/041bf5a8ec54da19aad0169671793c2078bd6173/openai/api_requestor.py#L100
        """
        if chunk is None:
            return None

        if isinstance(chunk, str):
            # OpenAI sends `data: `
            if chunk.startswith("data: "):
                # Strip the prefix and any leading whitespace that might follow it
                _length_of_sse_data_prefix = len("data: ")
                return chunk[_length_of_sse_data_prefix:]
            elif chunk.startswith("data:"):
                # SageMaker sends `data:`, no trailing whitespace
                _length_of_sse_data_prefix = len("data:")
                return chunk[_length_of_sse_data_prefix:]

        return chunk


def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
    """Assume the most recent usage chunk has the total usage up to that point."""
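What the new helper does with the two prefix variants it documents (example chunks are made up; callers such as the `text_output` branch above additionally .strip() the result before parsing):

print(CustomStreamWrapper._strip_sse_data_from_chunk('data: {"text_output": "hi"}'))
# '{"text_output": "hi"}'   (OpenAI-style, space after the colon)

print(CustomStreamWrapper._strip_sse_data_from_chunk('data:{"text_output": "hi"}'))
# '{"text_output": "hi"}'   (SageMaker-style, no space after the colon)

print(CustomStreamWrapper._strip_sse_data_from_chunk(None))      # None
print(CustomStreamWrapper._strip_sse_data_from_chunk("[DONE]"))  # no prefix, returned unchanged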
@ -29,6 +29,7 @@ class AiohttpOpenAIChatConfig(OpenAILikeChatConfig):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
|
@ -474,7 +474,10 @@ class ModelResponseIterator:
|
|||
if len(self.content_blocks) == 0:
|
||||
return False
|
||||
|
||||
if self.content_blocks[0]["delta"]["type"] == "text_delta":
|
||||
if (
|
||||
self.content_blocks[0]["delta"]["type"] == "text_delta"
|
||||
or self.content_blocks[0]["delta"]["type"] == "thinking_delta"
|
||||
):
|
||||
return False
|
||||
|
||||
for block in self.content_blocks:
|
||||
|
@ -527,6 +530,7 @@ class ModelResponseIterator:
|
|||
provider_specific_fields = {}
|
||||
content_block = ContentBlockDelta(**chunk) # type: ignore
|
||||
thinking_blocks: List[ChatCompletionThinkingBlock] = []
|
||||
|
||||
self.content_blocks.append(content_block)
|
||||
if "text" in content_block["delta"]:
|
||||
text = content_block["delta"]["text"]
|
||||
|
@ -544,13 +548,13 @@ class ModelResponseIterator:
|
|||
provider_specific_fields["citation"] = content_block["delta"]["citation"]
|
||||
elif (
|
||||
"thinking" in content_block["delta"]
|
||||
or "signature_delta" == content_block["delta"]
|
||||
or "signature" in content_block["delta"]
|
||||
):
|
||||
thinking_blocks = [
|
||||
ChatCompletionThinkingBlock(
|
||||
type="thinking",
|
||||
thinking=content_block["delta"].get("thinking"),
|
||||
signature_delta=content_block["delta"].get("signature"),
|
||||
thinking=content_block["delta"].get("thinking") or "",
|
||||
signature=content_block["delta"].get("signature"),
|
||||
)
|
||||
]
|
||||
provider_specific_fields["thinking_blocks"] = thinking_blocks
|
||||
|
@ -616,9 +620,11 @@ class ModelResponseIterator:
|
|||
"index": self.tool_index,
|
||||
}
|
||||
elif type_chunk == "content_block_stop":
|
||||
|
||||
ContentBlockStop(**chunk) # type: ignore
|
||||
# check if tool call content block
|
||||
is_empty = self.check_empty_tool_call_args()
|
||||
|
||||
if is_empty:
|
||||
tool_use = {
|
||||
"id": None,
|
||||
|
|
|
@ -0,0 +1,179 @@
"""
- call /messages on Anthropic API
- Make streaming + non-streaming requests - just pass them through directly to Anthropic. No need to do anything special here
- Ensure requests are logged in the DB - stream + non-stream

"""

import json
from typing import Any, AsyncIterator, Dict, Optional, Union, cast

import httpx

import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.anthropic_messages.transformation import (
    BaseAnthropicMessagesConfig,
)
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    get_async_httpx_client,
)
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import ProviderSpecificHeader
from litellm.utils import ProviderConfigManager, client


class AnthropicMessagesHandler:

    @staticmethod
    async def _handle_anthropic_streaming(
        response: httpx.Response,
        request_body: dict,
        litellm_logging_obj: LiteLLMLoggingObj,
    ) -> AsyncIterator:
        """Helper function to handle Anthropic streaming responses using the existing logging handlers"""
        from datetime import datetime

        from litellm.proxy.pass_through_endpoints.streaming_handler import (
            PassThroughStreamingHandler,
        )
        from litellm.proxy.pass_through_endpoints.success_handler import (
            PassThroughEndpointLogging,
        )
        from litellm.proxy.pass_through_endpoints.types import EndpointType

        # Create success handler object
        passthrough_success_handler_obj = PassThroughEndpointLogging()

        # Use the existing streaming handler for Anthropic
        start_time = datetime.now()
        return PassThroughStreamingHandler.chunk_processor(
            response=response,
            request_body=request_body,
            litellm_logging_obj=litellm_logging_obj,
            endpoint_type=EndpointType.ANTHROPIC,
            start_time=start_time,
            passthrough_success_handler_obj=passthrough_success_handler_obj,
            url_route="/v1/messages",
        )


@client
async def anthropic_messages(
    api_key: str,
    model: str,
    stream: bool = False,
    api_base: Optional[str] = None,
    client: Optional[AsyncHTTPHandler] = None,
    custom_llm_provider: Optional[str] = None,
    **kwargs,
) -> Union[Dict[str, Any], AsyncIterator]:
    """
    Makes Anthropic `/v1/messages` API calls in the Anthropic API spec.
    """
    # Use provided client or create a new one
    optional_params = GenericLiteLLMParams(**kwargs)
    model, _custom_llm_provider, dynamic_api_key, dynamic_api_base = (
        litellm.get_llm_provider(
            model=model,
            custom_llm_provider=custom_llm_provider,
            api_base=optional_params.api_base,
            api_key=optional_params.api_key,
        )
    )
    anthropic_messages_provider_config: Optional[BaseAnthropicMessagesConfig] = (
        ProviderConfigManager.get_provider_anthropic_messages_config(
            model=model,
            provider=litellm.LlmProviders(_custom_llm_provider),
        )
    )
    if anthropic_messages_provider_config is None:
        raise ValueError(
            f"Anthropic messages provider config not found for model: {model}"
        )
    if client is None or not isinstance(client, AsyncHTTPHandler):
        async_httpx_client = get_async_httpx_client(
            llm_provider=litellm.LlmProviders.ANTHROPIC
        )
    else:
        async_httpx_client = client

    litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)

    # Prepare headers
    provider_specific_header = cast(
        Optional[ProviderSpecificHeader], kwargs.get("provider_specific_header", None)
    )
    extra_headers = (
        provider_specific_header.get("extra_headers", {})
        if provider_specific_header
        else {}
    )
    headers = anthropic_messages_provider_config.validate_environment(
        headers=extra_headers or {},
        model=model,
        api_key=api_key,
    )

    litellm_logging_obj.update_environment_variables(
        model=model,
        optional_params=dict(optional_params),
        litellm_params={
            "metadata": kwargs.get("metadata", {}),
            "preset_cache_key": None,
            "stream_response": {},
            **optional_params.model_dump(exclude_unset=True),
        },
        custom_llm_provider=_custom_llm_provider,
    )
    litellm_logging_obj.model_call_details.update(kwargs)

    # Prepare request body
    request_body = kwargs.copy()
    request_body = {
        k: v
        for k, v in request_body.items()
        if k
        in anthropic_messages_provider_config.get_supported_anthropic_messages_params(
            model=model
        )
    }
    request_body["stream"] = stream
    request_body["model"] = model
    litellm_logging_obj.stream = stream

    # Make the request
    request_url = anthropic_messages_provider_config.get_complete_url(
        api_base=api_base, model=model
    )

    litellm_logging_obj.pre_call(
        input=[{"role": "user", "content": json.dumps(request_body)}],
        api_key="",
        additional_args={
            "complete_input_dict": request_body,
            "api_base": str(request_url),
            "headers": headers,
        },
    )

    response = await async_httpx_client.post(
        url=request_url,
        headers=headers,
        data=json.dumps(request_body),
        stream=stream,
    )
    response.raise_for_status()

    # used for logging + cost tracking
    litellm_logging_obj.model_call_details["httpx_response"] = response

    if stream:
        return await AnthropicMessagesHandler._handle_anthropic_streaming(
            response=response,
            request_body=request_body,
            litellm_logging_obj=litellm_logging_obj,
        )
    else:
        return response.json()
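A minimal usage sketch of the handler above, assuming it is re-exported at the package level as `litellm.anthropic_messages`; the model name and API key are placeholders, and the logging object is injected by the `@client` decorator:

```python
# Minimal sketch of calling the pass-through handler defined above.
# Assumes a package-level re-export as `litellm.anthropic_messages`;
# the model name and API key are placeholders.
import asyncio
import os

import litellm


async def main():
    # Non-streaming: returns the raw Anthropic /v1/messages response dict.
    response = await litellm.anthropic_messages(
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": "Hello"}],
        max_tokens=100,
    )
    print(response["content"])

    # Streaming: returns an async iterator of chunks passed through as-is.
    stream = await litellm.anthropic_messages(
        api_key=os.environ["ANTHROPIC_API_KEY"],
        model="claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": "Hello"}],
        max_tokens=100,
        stream=True,
    )
    async for chunk in stream:
        print(chunk)


asyncio.run(main())
```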
@ -0,0 +1,47 @@
from typing import Optional

from litellm.llms.base_llm.anthropic_messages.transformation import (
    BaseAnthropicMessagesConfig,
)

DEFAULT_ANTHROPIC_API_BASE = "https://api.anthropic.com"
DEFAULT_ANTHROPIC_API_VERSION = "2023-06-01"


class AnthropicMessagesConfig(BaseAnthropicMessagesConfig):
    def get_supported_anthropic_messages_params(self, model: str) -> list:
        return [
            "messages",
            "model",
            "system",
            "max_tokens",
            "stop_sequences",
            "temperature",
            "top_p",
            "top_k",
            "tools",
            "tool_choice",
            "thinking",
            # TODO: Add Anthropic `metadata` support
            # "metadata",
        ]

    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        api_base = api_base or DEFAULT_ANTHROPIC_API_BASE
        if not api_base.endswith("/v1/messages"):
            api_base = f"{api_base}/v1/messages"
        return api_base

    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
    ) -> dict:
        if "x-api-key" not in headers:
            headers["x-api-key"] = api_key
        if "anthropic-version" not in headers:
            headers["anthropic-version"] = DEFAULT_ANTHROPIC_API_VERSION
        if "content-type" not in headers:
            headers["content-type"] = "application/json"
        return headers
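The config above can be exercised in isolation; a small sketch follows, with the import path assumed from where this file sits in the diff:

```python
# Small sketch exercising the config above in isolation.
# The import path is assumed from this file's location in the diff.
from litellm.llms.anthropic.experimental_pass_through.messages.transformation import (
    AnthropicMessagesConfig,
)

config = AnthropicMessagesConfig()

# URL defaulting: the /v1/messages suffix is appended exactly once.
assert (
    config.get_complete_url(api_base=None, model="claude-3-7-sonnet-20250219")
    == "https://api.anthropic.com/v1/messages"
)

# Header defaulting: caller-supplied headers win, missing ones are filled in.
headers = config.validate_environment(
    headers={}, model="claude-3-7-sonnet-20250219", api_key="sk-ant-placeholder"
)
assert headers["anthropic-version"] == "2023-06-01"
assert headers["content-type"] == "application/json"
```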
@ -1,412 +0,0 @@
|
|||
import json
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
from openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice
|
||||
|
||||
from litellm.types.llms.anthropic import (
|
||||
AllAnthropicToolsValues,
|
||||
AnthopicMessagesAssistantMessageParam,
|
||||
AnthropicFinishReason,
|
||||
AnthropicMessagesRequest,
|
||||
AnthropicMessagesToolChoice,
|
||||
AnthropicMessagesUserMessageParam,
|
||||
AnthropicResponse,
|
||||
AnthropicResponseContentBlockText,
|
||||
AnthropicResponseContentBlockToolUse,
|
||||
AnthropicResponseUsageBlock,
|
||||
ContentBlockDelta,
|
||||
ContentJsonBlockDelta,
|
||||
ContentTextBlockDelta,
|
||||
MessageBlockDelta,
|
||||
MessageDelta,
|
||||
UsageDelta,
|
||||
)
|
||||
from litellm.types.llms.openai import (
|
||||
AllMessageValues,
|
||||
ChatCompletionAssistantMessage,
|
||||
ChatCompletionAssistantToolCall,
|
||||
ChatCompletionImageObject,
|
||||
ChatCompletionImageUrlObject,
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionSystemMessage,
|
||||
ChatCompletionTextObject,
|
||||
ChatCompletionToolCallFunctionChunk,
|
||||
ChatCompletionToolChoiceFunctionParam,
|
||||
ChatCompletionToolChoiceObjectParam,
|
||||
ChatCompletionToolChoiceValues,
|
||||
ChatCompletionToolMessage,
|
||||
ChatCompletionToolParam,
|
||||
ChatCompletionToolParamFunctionChunk,
|
||||
ChatCompletionUserMessage,
|
||||
)
|
||||
from litellm.types.utils import Choices, ModelResponse, Usage
|
||||
|
||||
|
||||
class AnthropicExperimentalPassThroughConfig:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
### FOR [BETA] `/v1/messages` endpoint support
|
||||
|
||||
def translatable_anthropic_params(self) -> List:
|
||||
"""
|
||||
Which anthropic params, we need to translate to the openai format.
|
||||
"""
|
||||
return ["messages", "metadata", "system", "tool_choice", "tools"]
|
||||
|
||||
def translate_anthropic_messages_to_openai( # noqa: PLR0915
|
||||
self,
|
||||
messages: List[
|
||||
Union[
|
||||
AnthropicMessagesUserMessageParam,
|
||||
AnthopicMessagesAssistantMessageParam,
|
||||
]
|
||||
],
|
||||
) -> List:
|
||||
new_messages: List[AllMessageValues] = []
|
||||
for m in messages:
|
||||
user_message: Optional[ChatCompletionUserMessage] = None
|
||||
tool_message_list: List[ChatCompletionToolMessage] = []
|
||||
new_user_content_list: List[
|
||||
Union[ChatCompletionTextObject, ChatCompletionImageObject]
|
||||
] = []
|
||||
## USER MESSAGE ##
|
||||
if m["role"] == "user":
|
||||
## translate user message
|
||||
message_content = m.get("content")
|
||||
if message_content and isinstance(message_content, str):
|
||||
user_message = ChatCompletionUserMessage(
|
||||
role="user", content=message_content
|
||||
)
|
||||
elif message_content and isinstance(message_content, list):
|
||||
for content in message_content:
|
||||
if content["type"] == "text":
|
||||
text_obj = ChatCompletionTextObject(
|
||||
type="text", text=content["text"]
|
||||
)
|
||||
new_user_content_list.append(text_obj)
|
||||
elif content["type"] == "image":
|
||||
image_url = ChatCompletionImageUrlObject(
|
||||
url=f"data:{content['type']};base64,{content['source']}"
|
||||
)
|
||||
image_obj = ChatCompletionImageObject(
|
||||
type="image_url", image_url=image_url
|
||||
)
|
||||
|
||||
new_user_content_list.append(image_obj)
|
||||
elif content["type"] == "tool_result":
|
||||
if "content" not in content:
|
||||
tool_result = ChatCompletionToolMessage(
|
||||
role="tool",
|
||||
tool_call_id=content["tool_use_id"],
|
||||
content="",
|
||||
)
|
||||
tool_message_list.append(tool_result)
|
||||
elif isinstance(content["content"], str):
|
||||
tool_result = ChatCompletionToolMessage(
|
||||
role="tool",
|
||||
tool_call_id=content["tool_use_id"],
|
||||
content=content["content"],
|
||||
)
|
||||
tool_message_list.append(tool_result)
|
||||
elif isinstance(content["content"], list):
|
||||
for c in content["content"]:
|
||||
if c["type"] == "text":
|
||||
tool_result = ChatCompletionToolMessage(
|
||||
role="tool",
|
||||
tool_call_id=content["tool_use_id"],
|
||||
content=c["text"],
|
||||
)
|
||||
tool_message_list.append(tool_result)
|
||||
elif c["type"] == "image":
|
||||
image_str = (
|
||||
f"data:{c['type']};base64,{c['source']}"
|
||||
)
|
||||
tool_result = ChatCompletionToolMessage(
|
||||
role="tool",
|
||||
tool_call_id=content["tool_use_id"],
|
||||
content=image_str,
|
||||
)
|
||||
tool_message_list.append(tool_result)
|
||||
|
||||
if user_message is not None:
|
||||
new_messages.append(user_message)
|
||||
|
||||
if len(new_user_content_list) > 0:
|
||||
new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore
|
||||
|
||||
if len(tool_message_list) > 0:
|
||||
new_messages.extend(tool_message_list)
|
||||
|
||||
## ASSISTANT MESSAGE ##
|
||||
assistant_message_str: Optional[str] = None
|
||||
tool_calls: List[ChatCompletionAssistantToolCall] = []
|
||||
if m["role"] == "assistant":
|
||||
if isinstance(m["content"], str):
|
||||
assistant_message_str = m["content"]
|
||||
elif isinstance(m["content"], list):
|
||||
for content in m["content"]:
|
||||
if content["type"] == "text":
|
||||
if assistant_message_str is None:
|
||||
assistant_message_str = content["text"]
|
||||
else:
|
||||
assistant_message_str += content["text"]
|
||||
elif content["type"] == "tool_use":
|
||||
function_chunk = ChatCompletionToolCallFunctionChunk(
|
||||
name=content["name"],
|
||||
arguments=json.dumps(content["input"]),
|
||||
)
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionAssistantToolCall(
|
||||
id=content["id"],
|
||||
type="function",
|
||||
function=function_chunk,
|
||||
)
|
||||
)
|
||||
|
||||
if assistant_message_str is not None or len(tool_calls) > 0:
|
||||
assistant_message = ChatCompletionAssistantMessage(
|
||||
role="assistant",
|
||||
content=assistant_message_str,
|
||||
)
|
||||
if len(tool_calls) > 0:
|
||||
assistant_message["tool_calls"] = tool_calls
|
||||
new_messages.append(assistant_message)
|
||||
|
||||
return new_messages
|
||||
|
||||
def translate_anthropic_tool_choice_to_openai(
|
||||
self, tool_choice: AnthropicMessagesToolChoice
|
||||
) -> ChatCompletionToolChoiceValues:
|
||||
if tool_choice["type"] == "any":
|
||||
return "required"
|
||||
elif tool_choice["type"] == "auto":
|
||||
return "auto"
|
||||
elif tool_choice["type"] == "tool":
|
||||
tc_function_param = ChatCompletionToolChoiceFunctionParam(
|
||||
name=tool_choice.get("name", "")
|
||||
)
|
||||
return ChatCompletionToolChoiceObjectParam(
|
||||
type="function", function=tc_function_param
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Incompatible tool choice param submitted - {}".format(tool_choice)
|
||||
)
|
||||
|
||||
def translate_anthropic_tools_to_openai(
|
||||
self, tools: List[AllAnthropicToolsValues]
|
||||
) -> List[ChatCompletionToolParam]:
|
||||
new_tools: List[ChatCompletionToolParam] = []
|
||||
mapped_tool_params = ["name", "input_schema", "description"]
|
||||
for tool in tools:
|
||||
function_chunk = ChatCompletionToolParamFunctionChunk(
|
||||
name=tool["name"],
|
||||
)
|
||||
if "input_schema" in tool:
|
||||
function_chunk["parameters"] = tool["input_schema"] # type: ignore
|
||||
if "description" in tool:
|
||||
function_chunk["description"] = tool["description"] # type: ignore
|
||||
|
||||
for k, v in tool.items():
|
||||
if k not in mapped_tool_params: # pass additional computer kwargs
|
||||
function_chunk.setdefault("parameters", {}).update({k: v})
|
||||
new_tools.append(
|
||||
ChatCompletionToolParam(type="function", function=function_chunk)
|
||||
)
|
||||
|
||||
return new_tools
|
||||
|
||||
def translate_anthropic_to_openai(
|
||||
self, anthropic_message_request: AnthropicMessagesRequest
|
||||
) -> ChatCompletionRequest:
|
||||
"""
|
||||
This is used by the beta Anthropic Adapter, for translating anthropic `/v1/messages` requests to the openai format.
|
||||
"""
|
||||
new_messages: List[AllMessageValues] = []
|
||||
|
||||
## CONVERT ANTHROPIC MESSAGES TO OPENAI
|
||||
new_messages = self.translate_anthropic_messages_to_openai(
|
||||
messages=anthropic_message_request["messages"]
|
||||
)
|
||||
## ADD SYSTEM MESSAGE TO MESSAGES
|
||||
if "system" in anthropic_message_request:
|
||||
new_messages.insert(
|
||||
0,
|
||||
ChatCompletionSystemMessage(
|
||||
role="system", content=anthropic_message_request["system"]
|
||||
),
|
||||
)
|
||||
|
||||
new_kwargs: ChatCompletionRequest = {
|
||||
"model": anthropic_message_request["model"],
|
||||
"messages": new_messages,
|
||||
}
|
||||
## CONVERT METADATA (user_id)
|
||||
if "metadata" in anthropic_message_request:
|
||||
if "user_id" in anthropic_message_request["metadata"]:
|
||||
new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
|
||||
|
||||
# Pass litellm proxy specific metadata
|
||||
if "litellm_metadata" in anthropic_message_request:
|
||||
# metadata will be passed to litellm.acompletion(), it's a litellm_param
|
||||
new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata")
|
||||
|
||||
## CONVERT TOOL CHOICE
|
||||
if "tool_choice" in anthropic_message_request:
|
||||
new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
|
||||
tool_choice=anthropic_message_request["tool_choice"]
|
||||
)
|
||||
## CONVERT TOOLS
|
||||
if "tools" in anthropic_message_request:
|
||||
new_kwargs["tools"] = self.translate_anthropic_tools_to_openai(
|
||||
tools=anthropic_message_request["tools"]
|
||||
)
|
||||
|
||||
translatable_params = self.translatable_anthropic_params()
|
||||
for k, v in anthropic_message_request.items():
|
||||
if k not in translatable_params: # pass remaining params as is
|
||||
new_kwargs[k] = v # type: ignore
|
||||
|
||||
return new_kwargs
|
||||
|
||||
def _translate_openai_content_to_anthropic(
|
||||
self, choices: List[Choices]
|
||||
) -> List[
|
||||
Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
|
||||
]:
|
||||
new_content: List[
|
||||
Union[
|
||||
AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse
|
||||
]
|
||||
] = []
|
||||
for choice in choices:
|
||||
if (
|
||||
choice.message.tool_calls is not None
|
||||
and len(choice.message.tool_calls) > 0
|
||||
):
|
||||
for tool_call in choice.message.tool_calls:
|
||||
new_content.append(
|
||||
AnthropicResponseContentBlockToolUse(
|
||||
type="tool_use",
|
||||
id=tool_call.id,
|
||||
name=tool_call.function.name or "",
|
||||
input=json.loads(tool_call.function.arguments),
|
||||
)
|
||||
)
|
||||
elif choice.message.content is not None:
|
||||
new_content.append(
|
||||
AnthropicResponseContentBlockText(
|
||||
type="text", text=choice.message.content
|
||||
)
|
||||
)
|
||||
|
||||
return new_content
|
||||
|
||||
def _translate_openai_finish_reason_to_anthropic(
|
||||
self, openai_finish_reason: str
|
||||
) -> AnthropicFinishReason:
|
||||
if openai_finish_reason == "stop":
|
||||
return "end_turn"
|
||||
elif openai_finish_reason == "length":
|
||||
return "max_tokens"
|
||||
elif openai_finish_reason == "tool_calls":
|
||||
return "tool_use"
|
||||
return "end_turn"
|
||||
|
||||
def translate_openai_response_to_anthropic(
|
||||
self, response: ModelResponse
|
||||
) -> AnthropicResponse:
|
||||
## translate content block
|
||||
anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices) # type: ignore
|
||||
## extract finish reason
|
||||
anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic(
|
||||
openai_finish_reason=response.choices[0].finish_reason # type: ignore
|
||||
)
|
||||
# extract usage
|
||||
usage: Usage = getattr(response, "usage")
|
||||
anthropic_usage = AnthropicResponseUsageBlock(
|
||||
input_tokens=usage.prompt_tokens or 0,
|
||||
output_tokens=usage.completion_tokens or 0,
|
||||
)
|
||||
translated_obj = AnthropicResponse(
|
||||
id=response.id,
|
||||
type="message",
|
||||
role="assistant",
|
||||
model=response.model or "unknown-model",
|
||||
stop_sequence=None,
|
||||
usage=anthropic_usage,
|
||||
content=anthropic_content,
|
||||
stop_reason=anthropic_finish_reason,
|
||||
)
|
||||
|
||||
return translated_obj
|
||||
|
||||
def _translate_streaming_openai_chunk_to_anthropic(
|
||||
self, choices: List[OpenAIStreamingChoice]
|
||||
) -> Tuple[
|
||||
Literal["text_delta", "input_json_delta"],
|
||||
Union[ContentTextBlockDelta, ContentJsonBlockDelta],
|
||||
]:
|
||||
text: str = ""
|
||||
partial_json: Optional[str] = None
|
||||
for choice in choices:
|
||||
if choice.delta.content is not None:
|
||||
text += choice.delta.content
|
||||
elif choice.delta.tool_calls is not None:
|
||||
partial_json = ""
|
||||
for tool in choice.delta.tool_calls:
|
||||
if (
|
||||
tool.function is not None
|
||||
and tool.function.arguments is not None
|
||||
):
|
||||
partial_json += tool.function.arguments
|
||||
|
||||
if partial_json is not None:
|
||||
return "input_json_delta", ContentJsonBlockDelta(
|
||||
type="input_json_delta", partial_json=partial_json
|
||||
)
|
||||
else:
|
||||
return "text_delta", ContentTextBlockDelta(type="text_delta", text=text)
|
||||
|
||||
def translate_streaming_openai_response_to_anthropic(
|
||||
self, response: ModelResponse
|
||||
) -> Union[ContentBlockDelta, MessageBlockDelta]:
|
||||
## base case - final chunk w/ finish reason
|
||||
if response.choices[0].finish_reason is not None:
|
||||
delta = MessageDelta(
|
||||
stop_reason=self._translate_openai_finish_reason_to_anthropic(
|
||||
response.choices[0].finish_reason
|
||||
),
|
||||
)
|
||||
if getattr(response, "usage", None) is not None:
|
||||
litellm_usage_chunk: Optional[Usage] = response.usage # type: ignore
|
||||
elif (
|
||||
hasattr(response, "_hidden_params")
|
||||
and "usage" in response._hidden_params
|
||||
):
|
||||
litellm_usage_chunk = response._hidden_params["usage"]
|
||||
else:
|
||||
litellm_usage_chunk = None
|
||||
if litellm_usage_chunk is not None:
|
||||
usage_delta = UsageDelta(
|
||||
input_tokens=litellm_usage_chunk.prompt_tokens or 0,
|
||||
output_tokens=litellm_usage_chunk.completion_tokens or 0,
|
||||
)
|
||||
else:
|
||||
usage_delta = UsageDelta(input_tokens=0, output_tokens=0)
|
||||
return MessageBlockDelta(
|
||||
type="message_delta", delta=delta, usage=usage_delta
|
||||
)
|
||||
(
|
||||
type_of_content,
|
||||
content_block_delta,
|
||||
) = self._translate_streaming_openai_chunk_to_anthropic(
|
||||
choices=response.choices # type: ignore
|
||||
)
|
||||
return ContentBlockDelta(
|
||||
type="content_block_delta",
|
||||
index=response.choices[0].index,
|
||||
delta=content_block_delta,
|
||||
)
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Coroutine, Iterable, Literal, Optional, Union
|
||||
from typing import Any, Coroutine, Dict, Iterable, Literal, Optional, Union
|
||||
|
||||
import httpx
|
||||
from openai import AsyncAzureOpenAI, AzureOpenAI
|
||||
|
@ -18,10 +18,10 @@ from ...types.llms.openai import (
|
|||
SyncCursorPage,
|
||||
Thread,
|
||||
)
|
||||
from ..base import BaseLLM
|
||||
from .common_utils import BaseAzureLLM
|
||||
|
||||
|
||||
class AzureAssistantsAPI(BaseLLM):
|
||||
class AzureAssistantsAPI(BaseAzureLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
|
@ -34,18 +34,17 @@ class AzureAssistantsAPI(BaseLLM):
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
client: Optional[AzureOpenAI] = None,
litellm_params: Optional[dict] = None,
) -> AzureOpenAI:
received_args = locals()
if client is None:
data = {}
for k, v in received_args.items():
if k == "self" or k == "client":
pass
elif k == "api_base" and v is not None:
data["azure_endpoint"] = v
elif v is not None:
data[k] = v
azure_openai_client = AzureOpenAI(**data)  # type: ignore
azure_client_params = self.initialize_azure_sdk_client(
litellm_params=litellm_params or {},
api_key=api_key,
api_base=api_base,
model_name="",
api_version=api_version,
)
azure_openai_client = AzureOpenAI(**azure_client_params)  # type: ignore
else:
azure_openai_client = client
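For context, a hedged caller-side sketch of how this client-construction path is typically reached; the exported name `aget_assistants` and its parameters are assumed from litellm's public Assistants API, not from this diff:

```python
# Hedged sketch: the Assistants entrypoints eventually reach the
# get_azure_client()/async_get_azure_client() helpers refactored above.
# `litellm.aget_assistants` and its parameters are assumptions here.
import asyncio
import os

import litellm


async def main():
    assistants = await litellm.aget_assistants(
        custom_llm_provider="azure",
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        api_base=os.environ["AZURE_OPENAI_ENDPOINT"],  # e.g. https://<resource>.openai.azure.com
        api_version="2024-02-15-preview",
    )
    for assistant in assistants.data:
        print(assistant.id)


asyncio.run(main())
```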
@ -60,18 +59,18 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> AsyncAzureOpenAI:
|
||||
received_args = locals()
|
||||
if client is None:
|
||||
data = {}
|
||||
for k, v in received_args.items():
|
||||
if k == "self" or k == "client":
|
||||
pass
|
||||
elif k == "api_base" and v is not None:
|
||||
data["azure_endpoint"] = v
|
||||
elif v is not None:
|
||||
data[k] = v
|
||||
azure_openai_client = AsyncAzureOpenAI(**data)
|
||||
azure_client_params = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
model_name="",
|
||||
api_version=api_version,
|
||||
)
|
||||
|
||||
azure_openai_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
# azure_openai_client = AsyncAzureOpenAI(**data) # type: ignore
|
||||
else:
|
||||
azure_openai_client = client
|
||||
|
@ -89,6 +88,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> AsyncCursorPage[Assistant]:
|
||||
azure_openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -98,6 +98,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = await azure_openai_client.beta.assistants.list()
|
||||
|
@ -146,6 +147,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client=None,
|
||||
aget_assistants=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if aget_assistants is not None and aget_assistants is True:
|
||||
return self.async_get_assistants(
|
||||
|
@ -156,6 +158,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
azure_openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -165,6 +168,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries=max_retries,
|
||||
client=client,
|
||||
api_version=api_version,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = azure_openai_client.beta.assistants.list()
|
||||
|
@ -184,6 +188,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> OpenAIMessage:
|
||||
openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -193,6 +198,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
thread_message: OpenAIMessage = await openai_client.beta.threads.messages.create( # type: ignore
|
||||
|
@ -222,6 +228,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
a_add_message: Literal[True],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Coroutine[None, None, OpenAIMessage]:
|
||||
...
|
||||
|
||||
|
@ -238,6 +245,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AzureOpenAI],
|
||||
a_add_message: Optional[Literal[False]],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> OpenAIMessage:
|
||||
...
|
||||
|
||||
|
@ -255,6 +263,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client=None,
|
||||
a_add_message: Optional[bool] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if a_add_message is not None and a_add_message is True:
|
||||
return self.a_add_message(
|
||||
|
@ -267,6 +276,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -300,6 +310,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> AsyncCursorPage[OpenAIMessage]:
|
||||
openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -309,6 +320,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = await openai_client.beta.threads.messages.list(thread_id=thread_id)
|
||||
|
@ -329,6 +341,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
aget_messages: Literal[True],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Coroutine[None, None, AsyncCursorPage[OpenAIMessage]]:
|
||||
...
|
||||
|
||||
|
@ -344,6 +357,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AzureOpenAI],
|
||||
aget_messages: Optional[Literal[False]],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> SyncCursorPage[OpenAIMessage]:
|
||||
...
|
||||
|
||||
|
@ -360,6 +374,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client=None,
|
||||
aget_messages=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if aget_messages is not None and aget_messages is True:
|
||||
return self.async_get_messages(
|
||||
|
@ -371,6 +386,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -380,6 +396,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = openai_client.beta.threads.messages.list(thread_id=thread_id)
|
||||
|
@ -399,6 +416,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Thread:
|
||||
openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -408,6 +426,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
data = {}
|
||||
|
@ -435,6 +454,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
acreate_thread: Literal[True],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Coroutine[None, None, Thread]:
|
||||
...
|
||||
|
||||
|
@ -451,6 +471,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
|
||||
client: Optional[AzureOpenAI],
|
||||
acreate_thread: Optional[Literal[False]],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Thread:
|
||||
...
|
||||
|
||||
|
@ -468,6 +489,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
|
||||
client=None,
|
||||
acreate_thread=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Here's an example:
|
||||
|
@ -490,6 +512,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries=max_retries,
|
||||
client=client,
|
||||
messages=messages,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
azure_openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -499,6 +522,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
data = {}
|
||||
|
@ -521,6 +545,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Thread:
|
||||
openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -530,6 +555,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = await openai_client.beta.threads.retrieve(thread_id=thread_id)
|
||||
|
@ -550,6 +576,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
aget_thread: Literal[True],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Coroutine[None, None, Thread]:
|
||||
...
|
||||
|
||||
|
@ -565,6 +592,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AzureOpenAI],
|
||||
aget_thread: Optional[Literal[False]],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Thread:
|
||||
...
|
||||
|
||||
|
@ -581,6 +609,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client=None,
|
||||
aget_thread=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if aget_thread is not None and aget_thread is True:
|
||||
return self.async_get_thread(
|
||||
|
@ -592,6 +621,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -601,6 +631,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = openai_client.beta.threads.retrieve(thread_id=thread_id)
|
||||
|
@ -618,7 +649,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
additional_instructions: Optional[str],
|
||||
instructions: Optional[str],
|
||||
metadata: Optional[object],
|
||||
metadata: Optional[Dict],
|
||||
model: Optional[str],
|
||||
stream: Optional[bool],
|
||||
tools: Optional[Iterable[AssistantToolParam]],
|
||||
|
@ -629,6 +660,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Run:
|
||||
openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -638,6 +670,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
api_version=api_version,
|
||||
azure_ad_token=azure_ad_token,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = await openai_client.beta.threads.runs.create_and_poll( # type: ignore
|
||||
|
@ -645,7 +678,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id=assistant_id,
|
||||
additional_instructions=additional_instructions,
|
||||
instructions=instructions,
|
||||
metadata=metadata,
|
||||
metadata=metadata, # type: ignore
|
||||
model=model,
|
||||
tools=tools,
|
||||
)
|
||||
|
@ -659,12 +692,13 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
additional_instructions: Optional[str],
|
||||
instructions: Optional[str],
|
||||
metadata: Optional[object],
|
||||
metadata: Optional[Dict],
|
||||
model: Optional[str],
|
||||
tools: Optional[Iterable[AssistantToolParam]],
|
||||
event_handler: Optional[AssistantEventHandler],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> AsyncAssistantStreamManager[AsyncAssistantEventHandler]:
|
||||
data = {
|
||||
data: Dict[str, Any] = {
|
||||
"thread_id": thread_id,
|
||||
"assistant_id": assistant_id,
|
||||
"additional_instructions": additional_instructions,
|
||||
|
@ -684,12 +718,13 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
additional_instructions: Optional[str],
|
||||
instructions: Optional[str],
|
||||
metadata: Optional[object],
|
||||
metadata: Optional[Dict],
|
||||
model: Optional[str],
|
||||
tools: Optional[Iterable[AssistantToolParam]],
|
||||
event_handler: Optional[AssistantEventHandler],
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> AssistantStreamManager[AssistantEventHandler]:
|
||||
data = {
|
||||
data: Dict[str, Any] = {
|
||||
"thread_id": thread_id,
|
||||
"assistant_id": assistant_id,
|
||||
"additional_instructions": additional_instructions,
|
||||
|
@ -711,7 +746,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
additional_instructions: Optional[str],
|
||||
instructions: Optional[str],
|
||||
metadata: Optional[object],
|
||||
metadata: Optional[Dict],
|
||||
model: Optional[str],
|
||||
stream: Optional[bool],
|
||||
tools: Optional[Iterable[AssistantToolParam]],
|
||||
|
@ -733,7 +768,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
additional_instructions: Optional[str],
|
||||
instructions: Optional[str],
|
||||
metadata: Optional[object],
|
||||
metadata: Optional[Dict],
|
||||
model: Optional[str],
|
||||
stream: Optional[bool],
|
||||
tools: Optional[Iterable[AssistantToolParam]],
|
||||
|
@ -756,7 +791,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
additional_instructions: Optional[str],
|
||||
instructions: Optional[str],
|
||||
metadata: Optional[object],
|
||||
metadata: Optional[Dict],
|
||||
model: Optional[str],
|
||||
stream: Optional[bool],
|
||||
tools: Optional[Iterable[AssistantToolParam]],
|
||||
|
@ -769,6 +804,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
client=None,
|
||||
arun_thread=None,
|
||||
event_handler: Optional[AssistantEventHandler] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if arun_thread is not None and arun_thread is True:
|
||||
if stream is not None and stream is True:
|
||||
|
@ -780,6 +816,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
return self.async_run_thread_stream(
|
||||
client=azure_client,
|
||||
|
@ -791,13 +828,14 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
model=model,
|
||||
tools=tools,
|
||||
event_handler=event_handler,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
return self.arun_thread(
|
||||
thread_id=thread_id,
|
||||
assistant_id=assistant_id,
|
||||
additional_instructions=additional_instructions,
|
||||
instructions=instructions,
|
||||
metadata=metadata,
|
||||
metadata=metadata, # type: ignore
|
||||
model=model,
|
||||
stream=stream,
|
||||
tools=tools,
|
||||
|
@ -808,6 +846,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -817,6 +856,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
if stream is not None and stream is True:
|
||||
|
@ -830,6 +870,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
model=model,
|
||||
tools=tools,
|
||||
event_handler=event_handler,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = openai_client.beta.threads.runs.create_and_poll( # type: ignore
|
||||
|
@ -837,7 +878,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id=assistant_id,
|
||||
additional_instructions=additional_instructions,
|
||||
instructions=instructions,
|
||||
metadata=metadata,
|
||||
metadata=metadata, # type: ignore
|
||||
model=model,
|
||||
tools=tools,
|
||||
)
|
||||
|
@ -855,6 +896,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
create_assistant_data: dict,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Assistant:
|
||||
azure_openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -864,6 +906,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = await azure_openai_client.beta.assistants.create(
|
||||
|
@ -882,6 +925,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
create_assistant_data: dict,
|
||||
client=None,
|
||||
async_create_assistants=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if async_create_assistants is not None and async_create_assistants is True:
|
||||
return self.async_create_assistants(
|
||||
|
@ -893,6 +937,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries=max_retries,
|
||||
client=client,
|
||||
create_assistant_data=create_assistant_data,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
azure_openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -902,6 +947,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = azure_openai_client.beta.assistants.create(**create_assistant_data)
|
||||
|
@ -918,6 +964,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
client: Optional[AsyncAzureOpenAI],
|
||||
assistant_id: str,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
azure_openai_client = self.async_get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -927,6 +974,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = await azure_openai_client.beta.assistants.delete(
|
||||
|
@ -945,6 +993,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
assistant_id: str,
|
||||
async_delete_assistants: Optional[bool] = None,
|
||||
client=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
if async_delete_assistants is not None and async_delete_assistants is True:
|
||||
return self.async_delete_assistant(
|
||||
|
@ -956,6 +1005,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
max_retries=max_retries,
|
||||
client=client,
|
||||
assistant_id=assistant_id,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
azure_openai_client = self.get_azure_client(
|
||||
api_key=api_key,
|
||||
|
@ -965,6 +1015,7 @@ class AzureAssistantsAPI(BaseLLM):
|
|||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
|
||||
response = azure_openai_client.beta.assistants.delete(assistant_id=assistant_id)
|
||||
|
|
|
@ -7,14 +7,14 @@ from pydantic import BaseModel
|
|||
import litellm
|
||||
from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_name
|
||||
from litellm.types.utils import FileTypes
|
||||
from litellm.utils import TranscriptionResponse, convert_to_model_response_object
|
||||
|
||||
from .azure import (
|
||||
AzureChatCompletion,
|
||||
get_azure_ad_token_from_oidc,
|
||||
select_azure_base_url_or_endpoint,
|
||||
from litellm.utils import (
|
||||
TranscriptionResponse,
|
||||
convert_to_model_response_object,
|
||||
extract_duration_from_srt_or_vtt,
|
||||
)
|
||||
|
||||
from .azure import AzureChatCompletion
|
||||
|
||||
|
||||
class AzureAudioTranscription(AzureChatCompletion):
|
||||
def audio_transcriptions(
|
||||
|
@ -32,29 +32,18 @@ class AzureAudioTranscription(AzureChatCompletion):
|
|||
client=None,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
atranscription: bool = False,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> TranscriptionResponse:
|
||||
data = {"model": model, "file": audio_file, **optional_params}
|
||||
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"timeout": timeout,
|
||||
}
|
||||
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
azure_client_params = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
model_name=model,
|
||||
api_version=api_version,
|
||||
api_base=api_base,
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
|
||||
if max_retries is not None:
|
||||
azure_client_params["max_retries"] = max_retries
|
||||
|
||||
if atranscription is True:
|
||||
return self.async_audio_transcriptions( # type: ignore
|
||||
|
@ -124,7 +113,6 @@ class AzureAudioTranscription(AzureChatCompletion):
|
|||
if client is None:
|
||||
async_azure_client = AsyncAzureOpenAI(
|
||||
**azure_client_params,
|
||||
http_client=litellm.aclient_session,
|
||||
)
|
||||
else:
|
||||
async_azure_client = client
|
||||
|
@ -156,6 +144,8 @@ class AzureAudioTranscription(AzureChatCompletion):
|
|||
stringified_response = response.model_dump()
|
||||
else:
|
||||
stringified_response = TranscriptionResponse(text=response).model_dump()
|
||||
duration = extract_duration_from_srt_or_vtt(response)
|
||||
stringified_response["duration"] = duration
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
||||
|
||||
|
@ -8,7 +7,6 @@ import httpx # type: ignore
|
|||
from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
|
||||
|
||||
import litellm
|
||||
from litellm.caching.caching import DualCache
|
||||
from litellm.constants import DEFAULT_MAX_RETRIES
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
|
@ -25,15 +23,18 @@ from litellm.types.utils import (
|
|||
from litellm.utils import (
|
||||
CustomStreamWrapper,
|
||||
convert_to_model_response_object,
|
||||
get_secret,
|
||||
modify_url,
|
||||
)
|
||||
|
||||
from ...types.llms.openai import HttpxBinaryResponseContent
|
||||
from ..base import BaseLLM
|
||||
from .common_utils import AzureOpenAIError, process_azure_headers
|
||||
|
||||
azure_ad_cache = DualCache()
|
||||
from .common_utils import (
|
||||
AzureOpenAIError,
|
||||
BaseAzureLLM,
|
||||
get_azure_ad_token_from_oidc,
|
||||
process_azure_headers,
|
||||
select_azure_base_url_or_endpoint,
|
||||
)
|
||||
|
||||
|
||||
class AzureOpenAIAssistantsAPIConfig:
|
||||
|
@ -98,93 +99,6 @@ class AzureOpenAIAssistantsAPIConfig:
|
|||
return optional_params
|
||||
|
||||
|
||||
def select_azure_base_url_or_endpoint(azure_client_params: dict):
|
||||
azure_endpoint = azure_client_params.get("azure_endpoint", None)
|
||||
if azure_endpoint is not None:
|
||||
# see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
|
||||
if "/openai/deployments" in azure_endpoint:
|
||||
# this is base_url, not an azure_endpoint
|
||||
azure_client_params["base_url"] = azure_endpoint
|
||||
azure_client_params.pop("azure_endpoint")
|
||||
|
||||
return azure_client_params
|
||||
|
||||
|
||||
def get_azure_ad_token_from_oidc(azure_ad_token: str):
|
||||
azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
|
||||
azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
|
||||
azure_authority_host = os.getenv(
|
||||
"AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
|
||||
)
|
||||
|
||||
if azure_client_id is None or azure_tenant_id is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=422,
|
||||
message="AZURE_CLIENT_ID and AZURE_TENANT_ID must be set",
|
||||
)
|
||||
|
||||
oidc_token = get_secret(azure_ad_token)
|
||||
|
||||
if oidc_token is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=401,
|
||||
message="OIDC token could not be retrieved from secret manager.",
|
||||
)
|
||||
|
||||
azure_ad_token_cache_key = json.dumps(
|
||||
{
|
||||
"azure_client_id": azure_client_id,
|
||||
"azure_tenant_id": azure_tenant_id,
|
||||
"azure_authority_host": azure_authority_host,
|
||||
"oidc_token": oidc_token,
|
||||
}
|
||||
)
|
||||
|
||||
azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
|
||||
if azure_ad_token_access_token is not None:
|
||||
return azure_ad_token_access_token
|
||||
|
||||
client = litellm.module_level_client
|
||||
req_token = client.post(
|
||||
f"{azure_authority_host}/{azure_tenant_id}/oauth2/v2.0/token",
|
||||
data={
|
||||
"client_id": azure_client_id,
|
||||
"grant_type": "client_credentials",
|
||||
"scope": "https://cognitiveservices.azure.com/.default",
|
||||
"client_assertion_type": "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
|
||||
"client_assertion": oidc_token,
|
||||
},
|
||||
)
|
||||
|
||||
if req_token.status_code != 200:
|
||||
raise AzureOpenAIError(
|
||||
status_code=req_token.status_code,
|
||||
message=req_token.text,
|
||||
)
|
||||
|
||||
azure_ad_token_json = req_token.json()
|
||||
azure_ad_token_access_token = azure_ad_token_json.get("access_token", None)
|
||||
azure_ad_token_expires_in = azure_ad_token_json.get("expires_in", None)
|
||||
|
||||
if azure_ad_token_access_token is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="Azure AD Token access_token not returned"
|
||||
)
|
||||
|
||||
if azure_ad_token_expires_in is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="Azure AD Token expires_in not returned"
|
||||
)
|
||||
|
||||
azure_ad_cache.set_cache(
|
||||
key=azure_ad_token_cache_key,
|
||||
value=azure_ad_token_access_token,
|
||||
ttl=azure_ad_token_expires_in,
|
||||
)
|
||||
|
||||
return azure_ad_token_access_token
|
||||
|
||||
|
||||
def _check_dynamic_azure_params(
|
||||
azure_client_params: dict,
|
||||
azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]],
|
||||
|
@ -206,7 +120,7 @@ def _check_dynamic_azure_params(
|
|||
return False
|
||||
|
||||
|
||||
class AzureChatCompletion(BaseLLM):
|
||||
class AzureChatCompletion(BaseAzureLLM, BaseLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
|
@ -238,27 +152,16 @@ class AzureChatCompletion(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
client: Optional[Any],
|
||||
client_type: Literal["sync", "async"],
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params: Dict[str, Any] = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.client_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
azure_client_params: Dict[str, Any] = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
model_name=model,
|
||||
api_version=api_version,
|
||||
api_base=api_base,
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
elif azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
|
||||
if client is None:
|
||||
if client_type == "sync":
|
||||
azure_client = AzureOpenAI(**azure_client_params) # type: ignore
|
||||
|
@ -357,6 +260,13 @@ class AzureChatCompletion(BaseLLM):
|
|||
max_retries = DEFAULT_MAX_RETRIES
|
||||
json_mode: Optional[bool] = optional_params.pop("json_mode", False)
|
||||
|
||||
azure_client_params = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
model_name=model,
|
||||
api_version=api_version,
|
||||
)
|
||||
### CHECK IF CLOUDFLARE AI GATEWAY ###
|
||||
### if so - set the model as part of the base url
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
|
@ -417,6 +327,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
azure_client_params=azure_client_params,
|
||||
)
|
||||
else:
|
||||
return self.acompletion(
|
||||
|
@ -434,6 +345,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
logging_obj=logging_obj,
|
||||
max_retries=max_retries,
|
||||
convert_tool_call_to_json_mode=json_mode,
|
||||
azure_client_params=azure_client_params,
|
||||
)
|
||||
elif "stream" in optional_params and optional_params["stream"] is True:
|
||||
return self.streaming(
|
||||
|
@ -470,28 +382,6 @@ class AzureChatCompletion(BaseLLM):
|
|||
status_code=422, message="max retries must be an int"
|
||||
)
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.client_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
elif azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = (
|
||||
azure_ad_token_provider
|
||||
)
|
||||
|
||||
if (
|
||||
client is None
|
||||
or not isinstance(client, AzureOpenAI)
|
||||
|
@ -540,10 +430,14 @@ class AzureChatCompletion(BaseLLM):
|
|||
status_code = getattr(e, "status_code", 500)
|
||||
error_headers = getattr(e, "headers", None)
|
||||
error_response = getattr(e, "response", None)
|
||||
error_body = getattr(e, "body", None)
|
||||
if error_headers is None and error_response:
|
||||
error_headers = getattr(error_response, "headers", None)
|
||||
raise AzureOpenAIError(
|
||||
status_code=status_code, message=str(e), headers=error_headers
|
||||
status_code=status_code,
|
||||
message=str(e),
|
||||
headers=error_headers,
|
||||
body=error_body,
|
||||
)
|
||||
|
||||
async def acompletion(
|
||||
|
@ -562,30 +456,10 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
convert_tool_call_to_json_mode: Optional[bool] = None,
|
||||
client=None, # this is the AsyncAzureOpenAI
|
||||
azure_client_params: dict = {},
|
||||
):
|
||||
response = None
|
||||
try:
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.aclient_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
elif azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
|
||||
|
||||
# setting Azure client
|
||||
if client is None or dynamic_params:
|
||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
|
@ -649,6 +523,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
raise AzureOpenAIError(status_code=500, message=str(e))
|
||||
except Exception as e:
|
||||
message = getattr(e, "message", str(e))
|
||||
body = getattr(e, "body", None)
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=data["messages"],
|
||||
|
@ -659,7 +534,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
if hasattr(e, "status_code"):
|
||||
raise e
|
||||
else:
|
||||
raise AzureOpenAIError(status_code=500, message=message)
|
||||
raise AzureOpenAIError(status_code=500, message=message, body=body)
|
||||
|
||||
def streaming(
|
||||
self,
|
||||
|
@ -742,28 +617,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_ad_token: Optional[str] = None,
|
||||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
client=None,
|
||||
azure_client_params: dict = {},
|
||||
):
|
||||
try:
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.aclient_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
elif azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
|
||||
if client is None or dynamic_params:
|
||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
|
@ -805,10 +661,14 @@ class AzureChatCompletion(BaseLLM):
|
|||
error_headers = getattr(e, "headers", None)
|
||||
error_response = getattr(e, "response", None)
|
||||
message = getattr(e, "message", str(e))
|
||||
error_body = getattr(e, "body", None)
|
||||
if error_headers is None and error_response:
|
||||
error_headers = getattr(error_response, "headers", None)
|
||||
raise AzureOpenAIError(
|
||||
status_code=status_code, message=message, headers=error_headers
|
||||
status_code=status_code,
|
||||
message=message,
|
||||
headers=error_headers,
|
||||
body=error_body,
|
||||
)
|
||||
|
||||
async def aembedding(
|
||||
|
@ -824,6 +684,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
):
|
||||
response = None
|
||||
try:
|
||||
|
||||
if client is None:
|
||||
openai_aclient = AsyncAzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
|
@ -875,6 +736,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
client=None,
|
||||
aembedding=None,
|
||||
headers: Optional[dict] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> EmbeddingResponse:
|
||||
if headers:
|
||||
optional_params["extra_headers"] = headers
|
||||
|
@ -890,29 +752,14 @@ class AzureChatCompletion(BaseLLM):
|
|||
)
|
||||
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if aembedding:
|
||||
azure_client_params["http_client"] = litellm.aclient_session
|
||||
else:
|
||||
azure_client_params["http_client"] = litellm.client_session
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
elif azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
|
||||
|
||||
azure_client_params = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
model_name=model,
|
||||
api_version=api_version,
|
||||
api_base=api_base,
|
||||
)
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input,
|
||||
|
@ -1272,6 +1119,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
client=None,
|
||||
aimg_generation=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> ImageResponse:
|
||||
try:
|
||||
if model and len(model) > 0:
|
||||
|
@ -1296,25 +1144,13 @@ class AzureChatCompletion(BaseLLM):
|
|||
)
|
||||
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params: Dict[str, Any] = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
azure_client_params: Dict[str, Any] = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
model_name=model or "",
|
||||
api_version=api_version,
|
||||
api_base=api_base,
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
if azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
elif azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
|
||||
|
||||
if aimg_generation is True:
|
||||
return self.aimage_generation(data=data, input=input, logging_obj=logging_obj, model_response=model_response, api_key=api_key, client=client, azure_client_params=azure_client_params, timeout=timeout, headers=headers) # type: ignore
|
||||
|
||||
|
@ -1377,6 +1213,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
aspeech: Optional[bool] = None,
|
||||
client=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> HttpxBinaryResponseContent:
|
||||
|
||||
max_retries = optional_params.pop("max_retries", 2)
|
||||
|
@ -1395,6 +1232,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
max_retries=max_retries,
|
||||
timeout=timeout,
|
||||
client=client,
|
||||
litellm_params=litellm_params,
|
||||
) # type: ignore
|
||||
|
||||
azure_client: AzureOpenAI = self._get_sync_azure_client(
|
||||
|
@ -1408,6 +1246,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
client_type="sync",
|
||||
litellm_params=litellm_params,
|
||||
) # type: ignore
|
||||
|
||||
response = azure_client.audio.speech.create(
|
||||
|
@ -1432,6 +1271,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
max_retries: int,
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
client=None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> HttpxBinaryResponseContent:
|
||||
|
||||
azure_client: AsyncAzureOpenAI = self._get_sync_azure_client(
|
||||
|
@ -1445,6 +1285,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
client_type="async",
|
||||
litellm_params=litellm_params,
|
||||
) # type: ignore
|
||||
|
||||
azure_response = await azure_client.audio.speech.create(
|
||||
|
|
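A minimal sketch of the error-propagation pattern added in the hunks above: the upstream exception's body is now forwarded to AzureOpenAIError alongside the message and headers. The import path is assumed from this PR's file layout and the helper name is hypothetical.

from litellm.llms.azure.common_utils import AzureOpenAIError  # path assumed from this PR

def raise_azure_error(e: Exception) -> None:
    # pull the same attributes the handler above reads off the SDK exception
    status_code = getattr(e, "status_code", 500)
    message = getattr(e, "message", str(e))
    error_headers = getattr(e, "headers", None)
    error_response = getattr(e, "response", None)
    error_body = getattr(e, "body", None)
    if error_headers is None and error_response is not None:
        error_headers = getattr(error_response, "headers", None)
    raise AzureOpenAIError(
        status_code=status_code,
        message=message,
        headers=error_headers,
        body=error_body,
    )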
|
@ -2,11 +2,10 @@
|
|||
Azure Batches API Handler
|
||||
"""
|
||||
|
||||
from typing import Any, Coroutine, Optional, Union
|
||||
from typing import Any, Coroutine, Optional, Union, cast
|
||||
|
||||
import httpx
|
||||
|
||||
import litellm
|
||||
from litellm.llms.azure.azure import AsyncAzureOpenAI, AzureOpenAI
|
||||
from litellm.types.llms.openai import (
|
||||
Batch,
|
||||
|
@ -14,9 +13,12 @@ from litellm.types.llms.openai import (
|
|||
CreateBatchRequest,
|
||||
RetrieveBatchRequest,
|
||||
)
|
||||
from litellm.types.utils import LiteLLMBatch
|
||||
|
||||
from ..common_utils import BaseAzureLLM
|
||||
|
||||
|
||||
class AzureBatchesAPI:
|
||||
class AzureBatchesAPI(BaseAzureLLM):
|
||||
"""
|
||||
Azure methods to support batches
|
||||
- create_batch()
|
||||
|
@ -28,45 +30,13 @@ class AzureBatchesAPI:
|
|||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def get_azure_openai_client(
|
||||
self,
|
||||
api_key: Optional[str],
|
||||
api_base: Optional[str],
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
api_version: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
_is_async: bool = False,
|
||||
) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
|
||||
received_args = locals()
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
|
||||
if client is None:
|
||||
data = {}
|
||||
for k, v in received_args.items():
|
||||
if k == "self" or k == "client" or k == "_is_async":
|
||||
pass
|
||||
elif k == "api_base" and v is not None:
|
||||
data["azure_endpoint"] = v
|
||||
elif v is not None:
|
||||
data[k] = v
|
||||
if "api_version" not in data:
|
||||
data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
|
||||
if _is_async is True:
|
||||
openai_client = AsyncAzureOpenAI(**data)
|
||||
else:
|
||||
openai_client = AzureOpenAI(**data) # type: ignore
|
||||
else:
|
||||
openai_client = client
|
||||
|
||||
return openai_client
|
||||
|
||||
async def acreate_batch(
|
||||
self,
|
||||
create_batch_data: CreateBatchRequest,
|
||||
azure_client: AsyncAzureOpenAI,
|
||||
) -> Batch:
|
||||
) -> LiteLLMBatch:
|
||||
response = await azure_client.batches.create(**create_batch_data)
|
||||
return response
|
||||
return LiteLLMBatch(**response.model_dump())
|
||||
|
||||
def create_batch(
|
||||
self,
|
||||
|
@ -78,16 +48,16 @@ class AzureBatchesAPI:
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
) -> Union[Batch, Coroutine[Any, Any, Batch]]:
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
|
||||
azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
self.get_azure_openai_client(
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
api_version=api_version,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
litellm_params=litellm_params or {},
|
||||
)
|
||||
)
|
||||
if azure_client is None:
|
||||
|
@ -103,16 +73,16 @@ class AzureBatchesAPI:
|
|||
return self.acreate_batch( # type: ignore
|
||||
create_batch_data=create_batch_data, azure_client=azure_client
|
||||
)
|
||||
response = azure_client.batches.create(**create_batch_data)
|
||||
return response
|
||||
response = cast(AzureOpenAI, azure_client).batches.create(**create_batch_data)
|
||||
return LiteLLMBatch(**response.model_dump())
|
||||
|
||||
async def aretrieve_batch(
|
||||
self,
|
||||
retrieve_batch_data: RetrieveBatchRequest,
|
||||
client: AsyncAzureOpenAI,
|
||||
) -> Batch:
|
||||
) -> LiteLLMBatch:
|
||||
response = await client.batches.retrieve(**retrieve_batch_data)
|
||||
return response
|
||||
return LiteLLMBatch(**response.model_dump())
|
||||
|
||||
def retrieve_batch(
|
||||
self,
|
||||
|
@ -124,16 +94,16 @@ class AzureBatchesAPI:
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AzureOpenAI] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
self.get_azure_openai_client(
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
litellm_params=litellm_params or {},
|
||||
)
|
||||
)
|
||||
if azure_client is None:
|
||||
|
@ -149,8 +119,10 @@ class AzureBatchesAPI:
|
|||
return self.aretrieve_batch( # type: ignore
|
||||
retrieve_batch_data=retrieve_batch_data, client=azure_client
|
||||
)
|
||||
response = azure_client.batches.retrieve(**retrieve_batch_data)
|
||||
return response
|
||||
response = cast(AzureOpenAI, azure_client).batches.retrieve(
|
||||
**retrieve_batch_data
|
||||
)
|
||||
return LiteLLMBatch(**response.model_dump())
|
||||
|
||||
async def acancel_batch(
|
||||
self,
|
||||
|
@ -170,16 +142,16 @@ class AzureBatchesAPI:
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[AzureOpenAI] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
self.get_azure_openai_client(
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
litellm_params=litellm_params or {},
|
||||
)
|
||||
)
|
||||
if azure_client is None:
|
||||
|
@ -209,16 +181,16 @@ class AzureBatchesAPI:
|
|||
after: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
client: Optional[AzureOpenAI] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
self.get_azure_openai_client(
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
api_version=api_version,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
litellm_params=litellm_params or {},
|
||||
)
|
||||
)
|
||||
if azure_client is None:
|
||||
|
|
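Both the sync and async batch paths above now normalize the SDK result into litellm's LiteLLMBatch via model_dump(). A rough sketch of that pattern; the helper name is made up and the import paths are taken from the diff.

from openai import AzureOpenAI
from litellm.types.utils import LiteLLMBatch

def retrieve_as_litellm_batch(azure_client: AzureOpenAI, batch_id: str) -> LiteLLMBatch:
    # run the SDK call, then re-hydrate into litellm's own type so callers
    # see one consistent shape regardless of code path
    response = azure_client.batches.retrieve(batch_id)
    return LiteLLMBatch(**response.model_dump())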
|
@ -4,50 +4,69 @@ Handler file for calls to Azure OpenAI's o1/o3 family of models
|
|||
Written separately to handle faking streaming for o1 and o3 models.
|
||||
"""
|
||||
|
||||
from typing import Optional, Union
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
import httpx
|
||||
from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
|
||||
|
||||
from litellm.types.utils import ModelResponse
|
||||
|
||||
from ...openai.openai import OpenAIChatCompletion
|
||||
from ..common_utils import get_azure_openai_client
|
||||
from ..common_utils import BaseAzureLLM
|
||||
|
||||
|
||||
class AzureOpenAIO1ChatCompletion(OpenAIChatCompletion):
|
||||
def _get_openai_client(
|
||||
class AzureOpenAIO1ChatCompletion(BaseAzureLLM, OpenAIChatCompletion):
|
||||
def completion(
|
||||
self,
|
||||
is_async: bool,
|
||||
model_response: ModelResponse,
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
logging_obj: Any,
|
||||
model: Optional[str] = None,
|
||||
messages: Optional[list] = None,
|
||||
print_verbose: Optional[Callable] = None,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
api_version: Optional[str] = None,
|
||||
timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
|
||||
max_retries: Optional[int] = 2,
|
||||
dynamic_params: Optional[bool] = None,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
acompletion: bool = False,
|
||||
logger_fn=None,
|
||||
headers: Optional[dict] = None,
|
||||
custom_prompt_dict: dict = {},
|
||||
client=None,
|
||||
organization: Optional[str] = None,
|
||||
client: Optional[
|
||||
Union[OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI]
|
||||
] = None,
|
||||
) -> Optional[
|
||||
Union[
|
||||
OpenAI,
|
||||
AsyncOpenAI,
|
||||
AzureOpenAI,
|
||||
AsyncAzureOpenAI,
|
||||
]
|
||||
]:
|
||||
|
||||
# Override to use Azure-specific client initialization
|
||||
if not isinstance(client, AzureOpenAI) and not isinstance(
|
||||
client, AsyncAzureOpenAI
|
||||
):
|
||||
client = None
|
||||
|
||||
return get_azure_openai_client(
|
||||
custom_llm_provider: Optional[str] = None,
|
||||
drop_params: Optional[bool] = None,
|
||||
):
|
||||
client = self.get_azure_openai_client(
|
||||
litellm_params=litellm_params,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
api_version=api_version,
|
||||
client=client,
|
||||
_is_async=is_async,
|
||||
_is_async=acompletion,
|
||||
)
|
||||
return super().completion(
|
||||
model_response=model_response,
|
||||
timeout=timeout,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logging_obj=logging_obj,
|
||||
model=model,
|
||||
messages=messages,
|
||||
print_verbose=print_verbose,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
dynamic_params=dynamic_params,
|
||||
azure_ad_token=azure_ad_token,
|
||||
acompletion=acompletion,
|
||||
logger_fn=logger_fn,
|
||||
headers=headers,
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
client=client,
|
||||
organization=organization,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
drop_params=drop_params,
|
||||
)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import json
|
||||
import os
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import httpx
|
||||
|
@ -5,9 +7,15 @@ from openai import AsyncAzureOpenAI, AzureOpenAI
|
|||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.caching.caching import DualCache
|
||||
from litellm.llms.base_llm.chat.transformation import BaseLLMException
|
||||
from litellm.secret_managers.get_azure_ad_token_provider import (
|
||||
get_azure_ad_token_provider,
|
||||
)
|
||||
from litellm.secret_managers.main import get_secret_str
|
||||
|
||||
azure_ad_cache = DualCache()
|
||||
|
||||
|
||||
class AzureOpenAIError(BaseLLMException):
|
||||
def __init__(
|
||||
|
@ -17,6 +25,7 @@ class AzureOpenAIError(BaseLLMException):
|
|||
request: Optional[httpx.Request] = None,
|
||||
response: Optional[httpx.Response] = None,
|
||||
headers: Optional[Union[httpx.Headers, dict]] = None,
|
||||
body: Optional[dict] = None,
|
||||
):
|
||||
super().__init__(
|
||||
status_code=status_code,
|
||||
|
@ -24,42 +33,10 @@ class AzureOpenAIError(BaseLLMException):
|
|||
request=request,
|
||||
response=response,
|
||||
headers=headers,
|
||||
body=body,
|
||||
)
|
||||
|
||||
|
||||
def get_azure_openai_client(
|
||||
api_key: Optional[str],
|
||||
api_base: Optional[str],
|
||||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
api_version: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
_is_async: bool = False,
|
||||
) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
|
||||
received_args = locals()
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
|
||||
if client is None:
|
||||
data = {}
|
||||
for k, v in received_args.items():
|
||||
if k == "self" or k == "client" or k == "_is_async":
|
||||
pass
|
||||
elif k == "api_base" and v is not None:
|
||||
data["azure_endpoint"] = v
|
||||
elif v is not None:
|
||||
data[k] = v
|
||||
if "api_version" not in data:
|
||||
data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
|
||||
if _is_async is True:
|
||||
openai_client = AsyncAzureOpenAI(**data)
|
||||
else:
|
||||
openai_client = AzureOpenAI(**data) # type: ignore
|
||||
else:
|
||||
openai_client = client
|
||||
|
||||
return openai_client
|
||||
|
||||
|
||||
def process_azure_headers(headers: Union[httpx.Headers, dict]) -> dict:
|
||||
openai_headers = {}
|
||||
if "x-ratelimit-limit-requests" in headers:
|
||||
|
@ -178,3 +155,199 @@ def get_azure_ad_token_from_username_password(
|
|||
verbose_logger.debug("token_provider %s", token_provider)
|
||||
|
||||
return token_provider
|
||||
|
||||
|
||||
def get_azure_ad_token_from_oidc(azure_ad_token: str):
|
||||
azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
|
||||
azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
|
||||
azure_authority_host = os.getenv(
|
||||
"AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
|
||||
)
|
||||
|
||||
if azure_client_id is None or azure_tenant_id is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=422,
|
||||
message="AZURE_CLIENT_ID and AZURE_TENANT_ID must be set",
|
||||
)
|
||||
|
||||
oidc_token = get_secret_str(azure_ad_token)
|
||||
|
||||
if oidc_token is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=401,
|
||||
message="OIDC token could not be retrieved from secret manager.",
|
||||
)
|
||||
|
||||
azure_ad_token_cache_key = json.dumps(
|
||||
{
|
||||
"azure_client_id": azure_client_id,
|
||||
"azure_tenant_id": azure_tenant_id,
|
||||
"azure_authority_host": azure_authority_host,
|
||||
"oidc_token": oidc_token,
|
||||
}
|
||||
)
|
||||
|
||||
azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
|
||||
if azure_ad_token_access_token is not None:
|
||||
return azure_ad_token_access_token
|
||||
|
||||
client = litellm.module_level_client
|
||||
req_token = client.post(
|
||||
f"{azure_authority_host}/{azure_tenant_id}/oauth2/v2.0/token",
|
||||
data={
|
||||
"client_id": azure_client_id,
|
||||
"grant_type": "client_credentials",
|
||||
"scope": "https://cognitiveservices.azure.com/.default",
|
||||
"client_assertion_type": "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
|
||||
"client_assertion": oidc_token,
|
||||
},
|
||||
)
|
||||
|
||||
if req_token.status_code != 200:
|
||||
raise AzureOpenAIError(
|
||||
status_code=req_token.status_code,
|
||||
message=req_token.text,
|
||||
)
|
||||
|
||||
azure_ad_token_json = req_token.json()
|
||||
azure_ad_token_access_token = azure_ad_token_json.get("access_token", None)
|
||||
azure_ad_token_expires_in = azure_ad_token_json.get("expires_in", None)
|
||||
|
||||
if azure_ad_token_access_token is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="Azure AD Token access_token not returned"
|
||||
)
|
||||
|
||||
if azure_ad_token_expires_in is None:
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="Azure AD Token expires_in not returned"
|
||||
)
|
||||
|
||||
azure_ad_cache.set_cache(
|
||||
key=azure_ad_token_cache_key,
|
||||
value=azure_ad_token_access_token,
|
||||
ttl=azure_ad_token_expires_in,
|
||||
)
|
||||
|
||||
return azure_ad_token_access_token
|
||||
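For reference, the exchange performed by get_azure_ad_token_from_oidc above boils down to a client_credentials request that swaps a federated OIDC JWT for an Entra ID access token, cached with a TTL of expires_in. An illustrative reproduction with placeholder values:

import httpx

resp = httpx.post(
    "https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token",
    data={
        "client_id": "<client-id>",
        "grant_type": "client_credentials",
        "scope": "https://cognitiveservices.azure.com/.default",
        "client_assertion_type": "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
        "client_assertion": "<federated-oidc-jwt>",  # placeholder for the OIDC token
    },
)
access_token = resp.json().get("access_token")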
|
||||
|
||||
def select_azure_base_url_or_endpoint(azure_client_params: dict):
|
||||
azure_endpoint = azure_client_params.get("azure_endpoint", None)
|
||||
if azure_endpoint is not None:
|
||||
# see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
|
||||
if "/openai/deployments" in azure_endpoint:
|
||||
# this is base_url, not an azure_endpoint
|
||||
azure_client_params["base_url"] = azure_endpoint
|
||||
azure_client_params.pop("azure_endpoint")
|
||||
|
||||
return azure_client_params
|
||||
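A hypothetical call showing what select_azure_base_url_or_endpoint above does: a deployment-style URL is re-labelled as base_url, while a plain resource endpoint is left as azure_endpoint. The URL is a placeholder and the import path is assumed from this PR's layout.

from litellm.llms.azure.common_utils import select_azure_base_url_or_endpoint  # path assumed

params = {
    "api_version": "2024-02-01",
    "azure_endpoint": "https://my-resource.openai.azure.com/openai/deployments/my-gpt-4o",
}
params = select_azure_base_url_or_endpoint(azure_client_params=params)
# the deployment URL now lives under base_url instead of azure_endpoint
assert "base_url" in params and "azure_endpoint" not in params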
|
||||
|
||||
class BaseAzureLLM:
|
||||
def get_azure_openai_client(
|
||||
self,
|
||||
litellm_params: dict,
|
||||
api_key: Optional[str],
|
||||
api_base: Optional[str],
|
||||
api_version: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
_is_async: bool = False,
|
||||
) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
|
||||
if client is None:
|
||||
azure_client_params = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
model_name="",
|
||||
api_version=api_version,
|
||||
)
|
||||
if _is_async is True:
|
||||
openai_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
openai_client = AzureOpenAI(**azure_client_params) # type: ignore
|
||||
else:
|
||||
openai_client = client
|
||||
|
||||
return openai_client
|
||||
|
||||
def initialize_azure_sdk_client(
|
||||
self,
|
||||
litellm_params: dict,
|
||||
api_key: Optional[str],
|
||||
api_base: Optional[str],
|
||||
model_name: str,
|
||||
api_version: Optional[str],
|
||||
) -> dict:
|
||||
|
||||
azure_ad_token_provider: Optional[Callable[[], str]] = None
|
||||
# api_key, when provided, takes priority over Azure AD auth
|
||||
azure_ad_token = litellm_params.get("azure_ad_token")
|
||||
tenant_id = litellm_params.get("tenant_id")
|
||||
client_id = litellm_params.get("client_id")
|
||||
client_secret = litellm_params.get("client_secret")
|
||||
azure_username = litellm_params.get("azure_username")
|
||||
azure_password = litellm_params.get("azure_password")
|
||||
max_retries = litellm_params.get("max_retries")
|
||||
timeout = litellm_params.get("timeout")
|
||||
if not api_key and tenant_id and client_id and client_secret:
|
||||
verbose_logger.debug("Using Azure AD Token Provider for Azure Auth")
|
||||
azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
|
||||
tenant_id=tenant_id,
|
||||
client_id=client_id,
|
||||
client_secret=client_secret,
|
||||
)
|
||||
if azure_username and azure_password and client_id:
|
||||
azure_ad_token_provider = get_azure_ad_token_from_username_password(
|
||||
azure_username=azure_username,
|
||||
azure_password=azure_password,
|
||||
client_id=client_id,
|
||||
)
|
||||
|
||||
if azure_ad_token is not None and azure_ad_token.startswith("oidc/"):
|
||||
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
|
||||
elif (
|
||||
not api_key
|
||||
and azure_ad_token_provider is None
|
||||
and litellm.enable_azure_ad_token_refresh is True
|
||||
):
|
||||
try:
|
||||
azure_ad_token_provider = get_azure_ad_token_provider()
|
||||
except ValueError:
|
||||
verbose_logger.debug("Azure AD Token Provider could not be used.")
|
||||
if api_version is None:
|
||||
api_version = os.getenv(
|
||||
"AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
|
||||
)
|
||||
|
||||
_api_key = api_key
|
||||
if _api_key is not None and isinstance(_api_key, str):
|
||||
# only show the first 8 chars of the api_key when logging
|
||||
_api_key = _api_key[:8] + "*" * 15
|
||||
verbose_logger.debug(
|
||||
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
|
||||
)
|
||||
azure_client_params = {
|
||||
"api_key": api_key,
|
||||
"azure_endpoint": api_base,
|
||||
"api_version": api_version,
|
||||
"azure_ad_token": azure_ad_token,
|
||||
"azure_ad_token_provider": azure_ad_token_provider,
|
||||
"http_client": litellm.client_session,
|
||||
}
|
||||
if max_retries is not None:
|
||||
azure_client_params["max_retries"] = max_retries
|
||||
if timeout is not None:
|
||||
azure_client_params["timeout"] = timeout
|
||||
|
||||
if azure_ad_token_provider is not None:
|
||||
azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
|
||||
# this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
|
||||
# required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
|
||||
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
|
||||
return azure_client_params
|
||||
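A hedged usage sketch of the initialize_azure_sdk_client flow above: with no api_key but tenant/client credentials in litellm_params, the returned dict carries an azure_ad_token_provider and can be splatted into AzureOpenAI or AsyncAzureOpenAI. Credential values and the endpoint are placeholders; the import path is assumed from this PR.

from litellm.llms.azure.common_utils import BaseAzureLLM  # path assumed

client_params = BaseAzureLLM().initialize_azure_sdk_client(
    litellm_params={
        "tenant_id": "<tenant-id>",
        "client_id": "<client-id>",
        "client_secret": "<client-secret>",
        "timeout": 30.0,
    },
    api_key=None,
    api_base="https://my-resource.openai.azure.com",
    model_name="my-gpt-4o-deployment",
    api_version=None,
)
# client_params now holds azure_endpoint, api_version and an azure_ad_token_provider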
|
|
|
@ -6,9 +6,8 @@ import litellm
|
|||
from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
|
||||
from litellm.utils import CustomStreamWrapper, ModelResponse, TextCompletionResponse
|
||||
|
||||
from ...base import BaseLLM
|
||||
from ...openai.completion.transformation import OpenAITextCompletionConfig
|
||||
from ..common_utils import AzureOpenAIError
|
||||
from ..common_utils import AzureOpenAIError, BaseAzureLLM
|
||||
|
||||
openai_text_completion_config = OpenAITextCompletionConfig()
|
||||
|
||||
|
@ -25,7 +24,7 @@ def select_azure_base_url_or_endpoint(azure_client_params: dict):
|
|||
return azure_client_params
|
||||
|
||||
|
||||
class AzureTextCompletion(BaseLLM):
|
||||
class AzureTextCompletion(BaseAzureLLM):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
|
@ -60,7 +59,6 @@ class AzureTextCompletion(BaseLLM):
|
|||
headers: Optional[dict] = None,
|
||||
client=None,
|
||||
):
|
||||
super().completion()
|
||||
try:
|
||||
if model is None or messages is None:
|
||||
raise AzureOpenAIError(
|
||||
|
@ -72,6 +70,14 @@ class AzureTextCompletion(BaseLLM):
|
|||
messages=messages, model=model, custom_llm_provider="azure_text"
|
||||
)
|
||||
|
||||
azure_client_params = self.initialize_azure_sdk_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
model_name=model,
|
||||
api_version=api_version,
|
||||
api_base=api_base,
|
||||
)
|
||||
|
||||
### CHECK IF CLOUDFLARE AI GATEWAY ###
|
||||
### if so - set the model as part of the base url
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
|
@ -118,6 +124,7 @@ class AzureTextCompletion(BaseLLM):
|
|||
azure_ad_token=azure_ad_token,
|
||||
timeout=timeout,
|
||||
client=client,
|
||||
azure_client_params=azure_client_params,
|
||||
)
|
||||
else:
|
||||
return self.acompletion(
|
||||
|
@ -132,6 +139,7 @@ class AzureTextCompletion(BaseLLM):
|
|||
client=client,
|
||||
logging_obj=logging_obj,
|
||||
max_retries=max_retries,
|
||||
azure_client_params=azure_client_params,
|
||||
)
|
||||
elif "stream" in optional_params and optional_params["stream"] is True:
|
||||
return self.streaming(
|
||||
|
@ -144,6 +152,7 @@ class AzureTextCompletion(BaseLLM):
|
|||
azure_ad_token=azure_ad_token,
|
||||
timeout=timeout,
|
||||
client=client,
|
||||
azure_client_params=azure_client_params,
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
|
@ -165,22 +174,6 @@ class AzureTextCompletion(BaseLLM):
|
|||
status_code=422, message="max retries must be an int"
|
||||
)
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.client_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
"azure_ad_token_provider": azure_ad_token_provider,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
if client is None:
|
||||
azure_client = AzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
|
@ -240,26 +233,11 @@ class AzureTextCompletion(BaseLLM):
|
|||
max_retries: int,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
client=None, # this is the AsyncAzureOpenAI
|
||||
azure_client_params: dict = {},
|
||||
):
|
||||
response = None
|
||||
try:
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.client_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
|
||||
# setting Azure client
|
||||
if client is None:
|
||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
|
@ -312,6 +290,7 @@ class AzureTextCompletion(BaseLLM):
|
|||
timeout: Any,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
client=None,
|
||||
azure_client_params: dict = {},
|
||||
):
|
||||
max_retries = data.pop("max_retries", 2)
|
||||
if not isinstance(max_retries, int):
|
||||
|
@ -319,21 +298,6 @@ class AzureTextCompletion(BaseLLM):
|
|||
status_code=422, message="max retries must be an int"
|
||||
)
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.client_session,
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
if client is None:
|
||||
azure_client = AzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
|
@ -375,24 +339,10 @@ class AzureTextCompletion(BaseLLM):
|
|||
timeout: Any,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
client=None,
|
||||
azure_client_params: dict = {},
|
||||
):
|
||||
try:
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.client_session,
|
||||
"max_retries": data.pop("max_retries", 2),
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
azure_client_params=azure_client_params
|
||||
)
|
||||
if api_key is not None:
|
||||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
if client is None:
|
||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
|
|
|
@ -5,13 +5,12 @@ from openai import AsyncAzureOpenAI, AzureOpenAI
|
|||
from openai.types.file_deleted import FileDeleted
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.llms.base import BaseLLM
|
||||
from litellm.types.llms.openai import *
|
||||
|
||||
from ..common_utils import get_azure_openai_client
|
||||
from ..common_utils import BaseAzureLLM
|
||||
|
||||
|
||||
class AzureOpenAIFilesAPI(BaseLLM):
|
||||
class AzureOpenAIFilesAPI(BaseAzureLLM):
|
||||
"""
|
||||
AzureOpenAI methods to support files
|
||||
- create_file()
|
||||
|
@ -45,14 +44,15 @@ class AzureOpenAIFilesAPI(BaseLLM):
|
|||
timeout: Union[float, httpx.Timeout],
|
||||
max_retries: Optional[int],
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
|
||||
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
get_azure_openai_client(
|
||||
self.get_azure_openai_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
)
|
||||
|
@ -91,17 +91,16 @@ class AzureOpenAIFilesAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
api_version: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Union[
|
||||
HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]
|
||||
]:
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
get_azure_openai_client(
|
||||
self.get_azure_openai_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
api_version=api_version,
|
||||
max_retries=max_retries,
|
||||
organization=None,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
)
|
||||
|
@ -144,14 +143,13 @@ class AzureOpenAIFilesAPI(BaseLLM):
|
|||
max_retries: Optional[int],
|
||||
api_version: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
get_azure_openai_client(
|
||||
self.get_azure_openai_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=None,
|
||||
api_version=api_version,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
|
@ -197,14 +195,13 @@ class AzureOpenAIFilesAPI(BaseLLM):
|
|||
organization: Optional[str] = None,
|
||||
api_version: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
get_azure_openai_client(
|
||||
self.get_azure_openai_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
api_version=api_version,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
|
@ -252,14 +249,13 @@ class AzureOpenAIFilesAPI(BaseLLM):
|
|||
purpose: Optional[str] = None,
|
||||
api_version: Optional[str] = None,
|
||||
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
):
|
||||
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
|
||||
get_azure_openai_client(
|
||||
self.get_azure_openai_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=None, # openai param
|
||||
api_version=api_version,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
|
|
|
@ -3,11 +3,11 @@ from typing import Optional, Union
|
|||
import httpx
|
||||
from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
|
||||
|
||||
from litellm.llms.azure.files.handler import get_azure_openai_client
|
||||
from litellm.llms.azure.common_utils import BaseAzureLLM
|
||||
from litellm.llms.openai.fine_tuning.handler import OpenAIFineTuningAPI
|
||||
|
||||
|
||||
class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI):
|
||||
class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI, BaseAzureLLM):
|
||||
"""
|
||||
AzureOpenAI methods to support fine tuning, inherits from OpenAIFineTuningAPI.
|
||||
"""
|
||||
|
@ -24,6 +24,7 @@ class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI):
|
|||
] = None,
|
||||
_is_async: bool = False,
|
||||
api_version: Optional[str] = None,
|
||||
litellm_params: Optional[dict] = None,
|
||||
) -> Optional[
|
||||
Union[
|
||||
OpenAI,
|
||||
|
@ -36,12 +37,10 @@ class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI):
|
|||
if isinstance(client, OpenAI) or isinstance(client, AsyncOpenAI):
|
||||
client = None
|
||||
|
||||
return get_azure_openai_client(
|
||||
return self.get_azure_openai_client(
|
||||
litellm_params=litellm_params or {},
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
api_version=api_version,
|
||||
client=client,
|
||||
_is_async=_is_async,
|
||||
|
|
|
@ -16,10 +16,23 @@ from litellm.llms.openai.openai import OpenAIConfig
|
|||
from litellm.secret_managers.main import get_secret_str
|
||||
from litellm.types.llms.openai import AllMessageValues
|
||||
from litellm.types.utils import ModelResponse, ProviderField
|
||||
from litellm.utils import _add_path_to_api_base
|
||||
from litellm.utils import _add_path_to_api_base, supports_tool_choice
|
||||
|
||||
|
||||
class AzureAIStudioConfig(OpenAIConfig):
|
||||
def get_supported_openai_params(self, model: str) -> List:
|
||||
model_supports_tool_choice = True # azure ai supports this by default
|
||||
if not supports_tool_choice(model=f"azure_ai/{model}"):
|
||||
model_supports_tool_choice = False
|
||||
supported_params = super().get_supported_openai_params(model)
|
||||
if not model_supports_tool_choice:
|
||||
filtered_supported_params = []
|
||||
for param in supported_params:
|
||||
if param != "tool_choice":
|
||||
filtered_supported_params.append(param)
|
||||
return filtered_supported_params
|
||||
return supported_params
|
||||
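A small usage sketch of the filtering above: when litellm marks a model as not supporting tool_choice, that param is removed from the supported list. The model name is hypothetical and the import path is assumed.

from litellm.llms.azure_ai.chat.transformation import AzureAIStudioConfig  # path assumed

cfg = AzureAIStudioConfig()
supported = cfg.get_supported_openai_params(model="my-azure-ai-model")
if "tool_choice" not in supported:
    print("tool_choice will be dropped for this model")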
|
||||
def validate_environment(
|
||||
self,
|
||||
headers: dict,
|
||||
|
@ -54,6 +67,7 @@ class AzureAIStudioConfig(OpenAIConfig):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
@ -79,12 +93,14 @@ class AzureAIStudioConfig(OpenAIConfig):
|
|||
original_url = httpx.URL(api_base)
|
||||
|
||||
# Extract api_version or use default
|
||||
api_version = cast(Optional[str], optional_params.get("api_version"))
|
||||
api_version = cast(Optional[str], litellm_params.get("api_version"))
|
||||
|
||||
# Check if 'api-version' is already present
|
||||
if "api-version" not in original_url.params and api_version:
|
||||
# Add api_version to optional_params
|
||||
original_url.params["api-version"] = api_version
|
||||
# Create a new dictionary with existing params
|
||||
query_params = dict(original_url.params)
|
||||
|
||||
# Add api_version if needed
|
||||
if "api-version" not in query_params and api_version:
|
||||
query_params["api-version"] = api_version
|
||||
|
||||
# Add the path to the base URL
|
||||
if "services.ai.azure.com" in api_base:
|
||||
|
@ -96,8 +112,7 @@ class AzureAIStudioConfig(OpenAIConfig):
|
|||
api_base=api_base, ending_path="/chat/completions"
|
||||
)
|
||||
|
||||
# Convert optional_params to query parameters
|
||||
query_params = original_url.params
|
||||
# Use the new query_params dictionary
|
||||
final_url = httpx.URL(new_url).copy_with(params=query_params)
|
||||
|
||||
return str(final_url)
|
||||
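A sketch of the URL handling above with placeholder values: existing query parameters are kept and api-version is only appended when it is missing.

import httpx

api_base = "https://my-resource.services.ai.azure.com/models"
query_params = dict(httpx.URL(api_base).params)
if "api-version" not in query_params:
    query_params["api-version"] = "2024-05-01-preview"  # placeholder version
final_url = httpx.URL(api_base + "/chat/completions").copy_with(params=query_params)
print(str(final_url))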
|
|
35
litellm/llms/base_llm/anthropic_messages/transformation.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
|
||||
|
||||
LiteLLMLoggingObj = _LiteLLMLoggingObj
|
||||
else:
|
||||
LiteLLMLoggingObj = Any
|
||||
|
||||
|
||||
class BaseAnthropicMessagesConfig(ABC):
|
||||
@abstractmethod
|
||||
def validate_environment(
|
||||
self,
|
||||
headers: dict,
|
||||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
) -> dict:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_complete_url(self, api_base: Optional[str], model: str) -> str:
|
||||
"""
|
||||
OPTIONAL
|
||||
|
||||
Get the complete url for the request
|
||||
|
||||
Some providers need `model` in `api_base`
|
||||
"""
|
||||
return api_base or ""
|
||||
|
||||
@abstractmethod
|
||||
def get_supported_anthropic_messages_params(self, model: str) -> list:
|
||||
pass
|
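A minimal concrete sketch of the new interface above; this is not a real litellm provider and deliberately skips inheriting the ABC so it stands alone. It only shows the three hooks a provider config would supply.

from typing import Optional

class ExampleAnthropicMessagesConfig:
    def validate_environment(self, headers: dict, model: str, api_key: Optional[str] = None) -> dict:
        # placeholder header name; real providers define their own auth scheme
        return {**headers, "x-api-key": api_key or ""}

    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        return (api_base or "https://api.anthropic.com") + "/v1/messages"

    def get_supported_anthropic_messages_params(self, model: str) -> list:
        return ["messages", "max_tokens", "system", "stream"]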
|
@ -30,6 +30,7 @@ class BaseAudioTranscriptionConfig(BaseConfig, ABC):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
|
@ -51,6 +51,7 @@ class BaseLLMException(Exception):
|
|||
headers: Optional[Union[dict, httpx.Headers]] = None,
|
||||
request: Optional[httpx.Request] = None,
|
||||
response: Optional[httpx.Response] = None,
|
||||
body: Optional[dict] = None,
|
||||
):
|
||||
self.status_code = status_code
|
||||
self.message: str = message
|
||||
|
@ -67,6 +68,7 @@ class BaseLLMException(Exception):
|
|||
self.response = httpx.Response(
|
||||
status_code=status_code, request=self.request
|
||||
)
|
||||
self.body = body
|
||||
super().__init__(
|
||||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
@ -268,6 +270,7 @@ class BaseConfig(ABC):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
|
@ -31,6 +31,7 @@ class BaseTextCompletionConfig(BaseConfig, ABC):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
|
@ -45,6 +45,7 @@ class BaseEmbeddingConfig(BaseConfig, ABC):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
|
@ -36,6 +36,7 @@ class BaseImageVariationConfig(BaseConfig, ABC):
|
|||
api_base: Optional[str],
|
||||
model: str,
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
133
litellm/llms/base_llm/responses/transformation.py
Normal file
|
@ -0,0 +1,133 @@
|
|||
import types
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
|
||||
|
||||
import httpx
|
||||
|
||||
from litellm.types.llms.openai import (
|
||||
ResponseInputParam,
|
||||
ResponsesAPIOptionalRequestParams,
|
||||
ResponsesAPIRequestParams,
|
||||
ResponsesAPIResponse,
|
||||
ResponsesAPIStreamingResponse,
|
||||
)
|
||||
from litellm.types.router import GenericLiteLLMParams
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
|
||||
|
||||
from ..chat.transformation import BaseLLMException as _BaseLLMException
|
||||
|
||||
LiteLLMLoggingObj = _LiteLLMLoggingObj
|
||||
BaseLLMException = _BaseLLMException
|
||||
else:
|
||||
LiteLLMLoggingObj = Any
|
||||
BaseLLMException = Any
|
||||
|
||||
|
||||
class BaseResponsesAPIConfig(ABC):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def get_config(cls):
|
||||
return {
|
||||
k: v
|
||||
for k, v in cls.__dict__.items()
|
||||
if not k.startswith("__")
|
||||
and not k.startswith("_abc")
|
||||
and not isinstance(
|
||||
v,
|
||||
(
|
||||
types.FunctionType,
|
||||
types.BuiltinFunctionType,
|
||||
classmethod,
|
||||
staticmethod,
|
||||
),
|
||||
)
|
||||
and v is not None
|
||||
}
|
||||
|
||||
@abstractmethod
|
||||
def get_supported_openai_params(self, model: str) -> list:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def map_openai_params(
|
||||
self,
|
||||
response_api_optional_params: ResponsesAPIOptionalRequestParams,
|
||||
model: str,
|
||||
drop_params: bool,
|
||||
) -> Dict:
|
||||
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def validate_environment(
|
||||
self,
|
||||
headers: dict,
|
||||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
) -> dict:
|
||||
return {}
|
||||
|
||||
@abstractmethod
|
||||
def get_complete_url(
|
||||
self,
|
||||
api_base: Optional[str],
|
||||
model: str,
|
||||
stream: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
OPTIONAL
|
||||
|
||||
Get the complete url for the request
|
||||
|
||||
Some providers need `model` in `api_base`
|
||||
"""
|
||||
if api_base is None:
|
||||
raise ValueError("api_base is required")
|
||||
return api_base
|
||||
|
||||
@abstractmethod
|
||||
def transform_responses_api_request(
|
||||
self,
|
||||
model: str,
|
||||
input: Union[str, ResponseInputParam],
|
||||
response_api_optional_request_params: Dict,
|
||||
litellm_params: GenericLiteLLMParams,
|
||||
headers: dict,
|
||||
) -> ResponsesAPIRequestParams:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def transform_response_api_response(
|
||||
self,
|
||||
model: str,
|
||||
raw_response: httpx.Response,
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
) -> ResponsesAPIResponse:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def transform_streaming_response(
|
||||
self,
|
||||
model: str,
|
||||
parsed_chunk: dict,
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
) -> ResponsesAPIStreamingResponse:
|
||||
"""
|
||||
Transform a parsed streaming response chunk into a ResponsesAPIStreamingResponse
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_error_class(
|
||||
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
|
||||
) -> BaseLLMException:
|
||||
from ..chat.transformation import BaseLLMException
|
||||
|
||||
raise BaseLLMException(
|
||||
status_code=status_code,
|
||||
message=error_message,
|
||||
headers=headers,
|
||||
)
|
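A skeleton of what a provider config for the new Responses API interface above has to supply. This is not litellm's real OpenAI implementation; a real subclass would also implement the transform_responses_api_request, transform_response_api_response and transform_streaming_response hooks, and the request shapes here are reduced to plain dicts so the sketch stands alone.

from typing import Dict, Optional

class ExampleResponsesAPIConfig:
    def get_supported_openai_params(self, model: str) -> list:
        return ["input", "temperature", "stream", "max_output_tokens"]

    def map_openai_params(self, response_api_optional_params: Dict, model: str, drop_params: bool) -> Dict:
        # drop anything this provider does not support when drop_params is set
        supported = set(self.get_supported_openai_params(model))
        if drop_params:
            return {k: v for k, v in response_api_optional_params.items() if k in supported}
        return dict(response_api_optional_params)

    def validate_environment(self, headers: dict, model: str, api_key: Optional[str] = None) -> dict:
        return {**headers, "Authorization": f"Bearer {api_key}"}

    def get_complete_url(self, api_base: Optional[str], model: str, stream: Optional[bool] = None) -> str:
        if api_base is None:
            raise ValueError("api_base is required")
        return api_base.rstrip("/") + "/responses"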
|
@ -554,6 +554,7 @@ class BaseAWSLLM:
|
|||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_session_token = optional_params.pop("aws_session_token", None)
|
||||
aws_region_name = self._get_aws_region_name(optional_params, model)
|
||||
optional_params.pop("aws_region_name", None)
|
||||
aws_role_name = optional_params.pop("aws_role_name", None)
|
||||
aws_session_name = optional_params.pop("aws_session_name", None)
|
||||
aws_profile_name = optional_params.pop("aws_profile_name", None)
|
||||
|
|
|
@ -272,7 +272,7 @@ class AmazonConverseConfig(BaseConfig):
|
|||
optional_params["temperature"] = value
|
||||
if param == "top_p":
|
||||
optional_params["topP"] = value
|
||||
if param == "tools":
|
||||
if param == "tools" and isinstance(value, list):
|
||||
optional_params = self._add_tools_to_optional_params(
|
||||
optional_params=optional_params, tools=value
|
||||
)
|
||||
|
@ -598,7 +598,7 @@ class AmazonConverseConfig(BaseConfig):
|
|||
if _text is not None:
|
||||
_thinking_block["thinking"] = _text
|
||||
if _signature is not None:
|
||||
_thinking_block["signature_delta"] = _signature
|
||||
_thinking_block["signature"] = _signature
|
||||
thinking_blocks_list.append(_thinking_block)
|
||||
return thinking_blocks_list
|
||||
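After the rename above, a reconstructed thinking block carries the accumulated signature under "signature" rather than "signature_delta". A placeholder example of the resulting shape:

_thinking_block = {
    "thinking": "...reasoning text...",            # placeholder
    "signature": "<provider-supplied signature>",  # placeholder
}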
|
||||
|
|