Merge branch 'main' into litellm_dev_03_12_2025_p1

2025-04-26 11:14:04 +00:00 · 2025-03-12 22:14:02 -07:00 · 2025-03-12 22:14:02 -07:00 · 72f92853e0
commit 72f92853e0
parent 2f62ac6992 fd8e364d4b
111 changed files with 7304 additions and 2714 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -49,7 +49,7 @@ jobs:
            pip install opentelemetry-api==1.25.0
            pip install opentelemetry-sdk==1.25.0
            pip install opentelemetry-exporter-otlp==1.25.0
-            pip install openai==1.54.0
+            pip install openai==1.66.1
            pip install prisma==0.11.0
            pip install "detect_secrets==1.5.0"
            pip install "httpx==0.24.1"
@ -168,7 +168,7 @@ jobs:
            pip install opentelemetry-api==1.25.0
            pip install opentelemetry-sdk==1.25.0
            pip install opentelemetry-exporter-otlp==1.25.0
-            pip install openai==1.54.0
+            pip install openai==1.66.1
            pip install prisma==0.11.0
            pip install "detect_secrets==1.5.0"
            pip install "httpx==0.24.1"
@ -267,7 +267,7 @@ jobs:
            pip install opentelemetry-api==1.25.0
            pip install opentelemetry-sdk==1.25.0
            pip install opentelemetry-exporter-otlp==1.25.0
-            pip install openai==1.54.0
+            pip install openai==1.66.1
            pip install prisma==0.11.0
            pip install "detect_secrets==1.5.0"
            pip install "httpx==0.24.1"
@ -511,7 +511,7 @@ jobs:
            pip install opentelemetry-api==1.25.0
            pip install opentelemetry-sdk==1.25.0
            pip install opentelemetry-exporter-otlp==1.25.0
-            pip install openai==1.54.0
+            pip install openai==1.66.1
            pip install prisma==0.11.0
            pip install "detect_secrets==1.5.0"
            pip install "httpx==0.24.1"
@ -678,6 +678,48 @@ jobs:
          paths:
            - llm_translation_coverage.xml
            - llm_translation_coverage
  llm_responses_api_testing:
    docker:
      - image: cimg/python:3.11
        auth:
          username: ${DOCKERHUB_USERNAME}
          password: ${DOCKERHUB_PASSWORD}
    working_directory: ~/project
    steps:
      - checkout
      - run:
          name: Install Dependencies
          command: |
            python -m pip install --upgrade pip
            python -m pip install -r requirements.txt
            pip install "pytest==7.3.1"
            pip install "pytest-retry==1.6.3"
            pip install "pytest-cov==5.0.0"
            pip install "pytest-asyncio==0.21.1"
            pip install "respx==0.21.1"
      # Run pytest and generate JUnit XML report
      - run:
          name: Run tests
          command: |
            pwd
            ls
            python -m pytest -vv tests/llm_responses_api_testing --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
          no_output_timeout: 120m
      - run:
          name: Rename the coverage files
          command: |
            mv coverage.xml llm_responses_api_coverage.xml
            mv .coverage llm_responses_api_coverage
      # Store test results
      - store_test_results:
          path: test-results
      - persist_to_workspace:
          root: .
          paths:
            - llm_responses_api_coverage.xml
            - llm_responses_api_coverage
  litellm_mapped_tests:
    docker:
      - image: cimg/python:3.11
@ -1234,7 +1276,7 @@ jobs:
            pip install "aiodynamo==23.10.1"
            pip install "asyncio==3.4.3"
            pip install "PyGithub==1.59.1"
-            pip install "openai==1.54.0 "
+            pip install "openai==1.66.1"
      - run:
          name: Install Grype
          command: |
@ -1309,7 +1351,7 @@ jobs:
          command: |
            pwd
            ls
-            python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
+            python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
          no_output_timeout: 120m
      # Store test results
@ -1370,7 +1412,7 @@ jobs:
            pip install "aiodynamo==23.10.1"
            pip install "asyncio==3.4.3"
            pip install "PyGithub==1.59.1"
-            pip install "openai==1.54.0 "
+            pip install "openai==1.66.1"
            # Run pytest and generate JUnit XML report
      - run:
          name: Build Docker image
@ -1492,7 +1534,7 @@ jobs:
            pip install "aiodynamo==23.10.1"
            pip install "asyncio==3.4.3"
            pip install "PyGithub==1.59.1"
-            pip install "openai==1.54.0 "
+            pip install "openai==1.66.1"
      - run:
          name: Build Docker image
          command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@ -1921,7 +1963,7 @@ jobs:
            pip install "pytest-asyncio==0.21.1"
            pip install "google-cloud-aiplatform==1.43.0"
            pip install aiohttp
-            pip install "openai==1.54.0 "
+            pip install "openai==1.66.1"
            pip install "assemblyai==0.37.0"
            python -m pip install --upgrade pip
            pip install "pydantic==2.7.1"
@ -2068,7 +2110,7 @@ jobs:
            python -m venv venv
            . venv/bin/activate
            pip install coverage
-            coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
+            coverage combine llm_translation_coverage llm_responses_api_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage
            coverage xml
      - codecov/upload:
          file: ./coverage.xml
@ -2197,7 +2239,7 @@ jobs:
            pip install "pytest-retry==1.6.3"
            pip install "pytest-asyncio==0.21.1"
            pip install aiohttp
-            pip install "openai==1.54.0 "
+            pip install "openai==1.66.1"
            python -m pip install --upgrade pip
            pip install "pydantic==2.7.1"
            pip install "pytest==7.3.1"
@ -2429,6 +2471,12 @@ workflows:
              only:
                - main
                - /litellm_.*/
      - llm_responses_api_testing:
          filters:
            branches:
              only:
                - main
                - /litellm_.*/
      - litellm_mapped_tests:
          filters:
            branches:
@ -2468,6 +2516,7 @@ workflows:
      - upload-coverage:
          requires:
            - llm_translation_testing
            - llm_responses_api_testing
            - litellm_mapped_tests
            - batches_testing
            - litellm_utils_testing
@ -2526,6 +2575,7 @@ workflows:
            - load_testing
            - test_bad_database_url
            - llm_translation_testing
            - llm_responses_api_testing
            - litellm_mapped_tests
            - batches_testing
            - litellm_utils_testing
--- a/.circleci/requirements.txt
+++ b/.circleci/requirements.txt
@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.54.0 
+openai==1.66.1
 python-dotenv
 tiktoken
 importlib_metadata
--- a/docs/my-website/docs/anthropic_unified.md
+++ b/docs/my-website/docs/anthropic_unified.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [BETA] `/v1/messages`
+# /v1/messages [BETA] 
 LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint. 
--- a/docs/my-website/docs/assistants.md
+++ b/docs/my-website/docs/assistants.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Assistants API 
+# /assistants
 Covers Threads, Messages, Assistants. 
--- a/docs/my-website/docs/batches.md
+++ b/docs/my-website/docs/batches.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [BETA] Batches API
+# /batches
 Covers Batches, Files
--- a/docs/my-website/docs/embedding/supported_embedding.md
+++ b/docs/my-website/docs/embedding/supported_embedding.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Embeddings
+# /embeddings
 ## Quick Start
 ```python
--- a/docs/my-website/docs/files_endpoints.md
+++ b/docs/my-website/docs/files_endpoints.md
@ -2,7 +2,7 @@
 import TabItem from '@theme/TabItem';
 import Tabs from '@theme/Tabs';
-# Files API
+# /files
 Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
--- a/docs/my-website/docs/fine_tuning.md
+++ b/docs/my-website/docs/fine_tuning.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# [Beta] Fine-tuning API
+# /fine_tuning
 :::info
--- a/docs/my-website/docs/moderation.md
+++ b/docs/my-website/docs/moderation.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Moderation
+# /moderations
 ### Usage
--- a/docs/my-website/docs/realtime.md
+++ b/docs/my-website/docs/realtime.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Realtime Endpoints
+# /realtime
 Use this to loadbalance across Azure + OpenAI. 
--- a/docs/my-website/docs/rerank.md
+++ b/docs/my-website/docs/rerank.md
@ -1,4 +1,4 @@
-# Rerank
+# /rerank
 :::tip
--- a/docs/my-website/docs/response_api.md
+++ b/docs/my-website/docs/response_api.md
@ -0,0 +1,117 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 # /responses
 LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](https://platform.openai.com/docs/api-reference/responses)
 | Feature | Supported | Notes |
 |---------|-----------|--------|
 | Cost Tracking | ✅ | Works with all supported models |
 | Logging | ✅ | Works across all integrations |
 | End-user Tracking | ✅ | |
 | Streaming | ✅ | |
 | Fallbacks | ✅ | Works between supported models |
 | Loadbalancing | ✅ | Works between supported models |
 | Supported LiteLLM Versions | 1.63.8+ | |
 | Supported LLM providers | `openai` | |
 ## Usage
 ## Create a model response
 <Tabs>
 <TabItem value="litellm-sdk" label="LiteLLM SDK">
 #### Non-streaming
 ```python
 import litellm
 # Non-streaming response
 response = litellm.responses(
    model="gpt-4o",
    input="Tell me a three sentence bedtime story about a unicorn.",
    max_output_tokens=100
 )
 print(response)
 ```
 #### Streaming
 ```python
 import litellm
 # Streaming response
 response = litellm.responses(
    model="gpt-4o",
    input="Tell me a three sentence bedtime story about a unicorn.",
    stream=True
 )
 for event in response:
    print(event)
 ```
 </TabItem>
 <TabItem value="proxy" label="OpenAI SDK with LiteLLM Proxy">
 First, add this to your litellm proxy config.yaml:
 ```yaml
 model_list:
  - model_name: gpt-4o
    litellm_params:
      model: openai/gpt-4
      api_key: os.environ/OPENAI_API_KEY
 ```
 Start your LiteLLM proxy:
 ```bash
 litellm --config /path/to/config.yaml
 # RUNNING on http://0.0.0.0:4000
 ```
 Then use the OpenAI SDK pointed to your proxy:
 #### Non-streaming
 ```python
 from openai import OpenAI
 # Initialize client with your proxy URL
 client = OpenAI(
    base_url="http://localhost:4000",  # Your proxy URL
    api_key="your-api-key"             # Your proxy API key
 )
 # Non-streaming response
 response = client.responses.create(
    model="gpt-4o",
    input="Tell me a three sentence bedtime story about a unicorn."
 )
 print(response)
 ```
 #### Streaming
 ```python
 from openai import OpenAI
 # Initialize client with your proxy URL
 client = OpenAI(
    base_url="http://localhost:4000",  # Your proxy URL
    api_key="your-api-key"             # Your proxy API key
 )
 # Streaming response
 response = client.responses.create(
    model="gpt-4o",
    input="Tell me a three sentence bedtime story about a unicorn.",
    stream=True
 )
 for event in response:
    print(event)
 ```
 </TabItem>
 </Tabs>
--- a/docs/my-website/docs/text_completion.md
+++ b/docs/my-website/docs/text_completion.md
@ -1,7 +1,7 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
-# Text Completion
+# /completions
 ### Usage
 <Tabs>
--- a/docs/my-website/package-lock.json
+++ b/docs/my-website/package-lock.json
@ -706,12 +706,13 @@
      }
    },
    "node_modules/@babel/helpers": {
-      "version": "7.26.0",
+      "version": "7.26.10",
-      "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.0.tgz",
+      "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.10.tgz",
-      "integrity": "sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==",
+      "integrity": "sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==",
      "license": "MIT",
      "dependencies": {
-        "@babel/template": "^7.25.9",
+        "@babel/template": "^7.26.9",
-        "@babel/types": "^7.26.0"
+        "@babel/types": "^7.26.10"
      },
      "engines": {
        "node": ">=6.9.0"
@ -796,11 +797,12 @@
      }
    },
    "node_modules/@babel/parser": {
-      "version": "7.26.3",
+      "version": "7.26.10",
-      "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.3.tgz",
+      "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.10.tgz",
-      "integrity": "sha512-WJ/CvmY8Mea8iDXo6a7RK2wbmJITT5fN3BEkRuFlxVyNx8jOKIIhmC4fSkTcPcf8JyavbBwIe6OpiCOBXt/IcA==",
+      "integrity": "sha512-6aQR2zGE/QFi8JpDLjUZEPYOs7+mhKXm86VaKFiLP35JQwQb6bwUE+XbvkH0EptsYhbNBSUGaUBLKqxH1xSgsA==",
      "license": "MIT",
      "dependencies": {
-        "@babel/types": "^7.26.3"
+        "@babel/types": "^7.26.10"
      },
      "bin": {
        "parser": "bin/babel-parser.js"
@ -2157,9 +2159,10 @@
      }
    },
    "node_modules/@babel/runtime-corejs3": {
-      "version": "7.26.0",
+      "version": "7.26.10",
-      "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.0.tgz",
+      "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.26.10.tgz",
-      "integrity": "sha512-YXHu5lN8kJCb1LOb9PgV6pvak43X2h4HvRApcN5SdWeaItQOzfn1hgP6jasD6KWQyJDBxrVmA9o9OivlnNJK/w==",
+      "integrity": "sha512-uITFQYO68pMEYR46AHgQoyBg7KPPJDAbGn4jUTIRgCFJIp88MIBUianVOplhZDEec07bp9zIyr4Kp0FCyQzmWg==",
      "license": "MIT",
      "dependencies": {
        "core-js-pure": "^3.30.2",
        "regenerator-runtime": "^0.14.0"
@ -2169,13 +2172,14 @@
      }
    },
    "node_modules/@babel/template": {
-      "version": "7.25.9",
+      "version": "7.26.9",
-      "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz",
+      "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.26.9.tgz",
-      "integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==",
+      "integrity": "sha512-qyRplbeIpNZhmzOysF/wFMuP9sctmh2cFzRAZOn1YapxBsE1i9bJIY586R/WBLfLcmcBlM8ROBiQURnnNy+zfA==",
      "license": "MIT",
      "dependencies": {
-        "@babel/code-frame": "^7.25.9",
+        "@babel/code-frame": "^7.26.2",
-        "@babel/parser": "^7.25.9",
+        "@babel/parser": "^7.26.9",
-        "@babel/types": "^7.25.9"
+        "@babel/types": "^7.26.9"
      },
      "engines": {
        "node": ">=6.9.0"
@ -2199,9 +2203,10 @@
      }
    },
    "node_modules/@babel/types": {
-      "version": "7.26.3",
+      "version": "7.26.10",
-      "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.3.tgz",
+      "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.10.tgz",
-      "integrity": "sha512-vN5p+1kl59GVKMvTHt55NzzmYVxprfJD+ql7U9NFIfKCBkYE55LYtS+WtPlaYOyzydrKI8Nezd+aZextrd+FMA==",
+      "integrity": "sha512-emqcG3vHrpxUKTrxcblR36dcrcoRDvKmnL/dCL6ZsHaShW80qxCAcNhzQZrpeM765VzEos+xOi4s+r4IXzTwdQ==",
      "license": "MIT",
      "dependencies": {
        "@babel/helper-string-parser": "^7.25.9",
        "@babel/helper-validator-identifier": "^7.25.9"
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -273,7 +273,7 @@ const sidebars = {
      items: [
        {
          type: "category",
-          label: "Chat",
+          label: "/chat/completions",
          link: {
            type: "generated-index",
            title: "Chat Completions",
@ -286,12 +286,13 @@ const sidebars = {
            "completion/usage",
          ],
        },
        "response_api",
        "text_completion",
        "embedding/supported_embedding",
        "anthropic_unified",
        {
          type: "category",
-          label: "Image",
+          label: "/images",
          items: [
            "image_generation",
            "image_variations",
@ -299,7 +300,7 @@ const sidebars = {
        },
        {
          type: "category",
-          label: "Audio",
+          label: "/audio",
          "items": [
            "audio_transcription",
            "text_to_speech",
--- a/enterprise/enterprise_hooks/aporia_ai.py
+++ b/enterprise/enterprise_hooks/aporia_ai.py
@ -163,7 +163,7 @@ class AporiaGuardrail(CustomGuardrail):
        pass
-    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
+    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
@ -173,6 +173,7 @@ class AporiaGuardrail(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        from litellm.proxy.common_utils.callback_utils import (
--- a/enterprise/enterprise_hooks/google_text_moderation.py
+++ b/enterprise/enterprise_hooks/google_text_moderation.py
@ -94,6 +94,7 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """
--- a/enterprise/enterprise_hooks/llama_guard.py
+++ b/enterprise/enterprise_hooks/llama_guard.py
@ -107,6 +107,7 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """
--- a/enterprise/enterprise_hooks/llm_guard.py
+++ b/enterprise/enterprise_hooks/llm_guard.py
@ -126,6 +126,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """
--- a/enterprise/enterprise_hooks/openai_moderation.py
+++ b/enterprise/enterprise_hooks/openai_moderation.py
@ -31,7 +31,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
    #### CALL HOOKS - proxy only ####
-    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
+    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
@ -41,6 +41,7 @@ class _ENTERPRISE_OpenAI_Moderation(CustomLogger):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        text = ""
--- a/litellm/init.py
+++ b/litellm/init.py
@ -8,12 +8,14 @@ import os
 from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
 from litellm.caching.llm_caching_handler import LLMClientCache
 from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
 from litellm.types.utils import (
    ImageObject,
    BudgetConfig,
    all_litellm_params,
    all_litellm_params as _litellm_completion_params,
    CredentialItem,
 )  # maintain backwards compatibility for root param
 from litellm._logging import (
    set_verbose,
@ -189,15 +191,17 @@ ssl_verify: Union[str, bool] = True
 ssl_certificate: Optional[str] = None
 disable_streaming_logging: bool = False
 disable_add_transform_inline_image_block: bool = False
-in_memory_llm_clients_cache: InMemoryCache = InMemoryCache()
+in_memory_llm_clients_cache: LLMClientCache = LLMClientCache()
 safe_memory_mode: bool = False
 enable_azure_ad_token_refresh: Optional[bool] = False
 ### DEFAULT AZURE API VERSION ###
-AZURE_DEFAULT_API_VERSION = "2024-08-01-preview"  # this is updated to the latest
+AZURE_DEFAULT_API_VERSION = "2025-02-01-preview"  # this is updated to the latest
 ### DEFAULT WATSONX API VERSION ###
 WATSONX_DEFAULT_API_VERSION = "2024-03-13"
 ### COHERE EMBEDDINGS DEFAULT TYPE ###
 COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
 ### CREDENTIALS ###
 credential_list: List[CredentialItem] = []
 ### GUARDRAILS ###
 llamaguard_model_name: Optional[str] = None
 openai_moderations_model_name: Optional[str] = None
@ -922,6 +926,7 @@ from .llms.groq.chat.transformation import GroqChatConfig
 from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
 from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
 from .llms.mistral.mistral_chat_transformation import MistralConfig
 from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
 from .llms.openai.chat.o_series_transformation import (
    OpenAIOSeriesConfig as OpenAIO1Config,  # maintain backwards compatibility
    OpenAIOSeriesConfig,
@ -1011,6 +1016,7 @@ from .batches.main import *
 from .batch_completion.main import *  # type: ignore
 from .rerank_api.main import *
 from .llms.anthropic.experimental_pass_through.messages.handler import *
 from .responses.main import *
 from .realtime_api.main import _arealtime
 from .fine_tuning.main import *
 from .files.main import *
--- a/litellm/assistants/main.py
+++ b/litellm/assistants/main.py
@ -15,6 +15,7 @@ import litellm
 from litellm.types.router import GenericLiteLLMParams
 from litellm.utils import (
    exception_type,
    get_litellm_params,
    get_llm_provider,
    get_secret,
    supports_httpx_timeout,
@ -86,6 +87,7 @@ def get_assistants(
    optional_params = GenericLiteLLMParams(
        api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
    )
    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -169,6 +171,7 @@ def get_assistants(
            max_retries=optional_params.max_retries,
            client=client,
            aget_assistants=aget_assistants,  # type: ignore
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -270,6 +273,7 @@ def create_assistants(
    optional_params = GenericLiteLLMParams(
        api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
    )
    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -371,6 +375,7 @@ def create_assistants(
            client=client,
            async_create_assistants=async_create_assistants,
            create_assistant_data=create_assistant_data,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -445,6 +450,8 @@ def delete_assistant(
        api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
    )
    litellm_params_dict = get_litellm_params(**kwargs)
    async_delete_assistants: Optional[bool] = kwargs.pop(
        "async_delete_assistants", None
    )
@ -544,6 +551,7 @@ def delete_assistant(
            max_retries=optional_params.max_retries,
            client=client,
            async_delete_assistants=async_delete_assistants,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -639,6 +647,7 @@ def create_thread(
    """
    acreate_thread = kwargs.get("acreate_thread", None)
    optional_params = GenericLiteLLMParams(**kwargs)
    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -731,6 +740,7 @@ def create_thread(
            max_retries=optional_params.max_retries,
            client=client,
            acreate_thread=acreate_thread,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -795,7 +805,7 @@ def get_thread(
    """Get the thread object, given a thread_id"""
    aget_thread = kwargs.pop("aget_thread", None)
    optional_params = GenericLiteLLMParams(**kwargs)
-
+    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
    # set timeout for 10 minutes by default
@ -884,6 +894,7 @@ def get_thread(
            max_retries=optional_params.max_retries,
            client=client,
            aget_thread=aget_thread,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -972,6 +983,7 @@ def add_message(
    _message_data = MessageData(
        role=role, content=content, attachments=attachments, metadata=metadata
    )
    litellm_params_dict = get_litellm_params(**kwargs)
    optional_params = GenericLiteLLMParams(**kwargs)
    message_data = get_optional_params_add_message(
@ -1068,6 +1080,7 @@ def add_message(
            max_retries=optional_params.max_retries,
            client=client,
            a_add_message=a_add_message,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -1139,6 +1152,7 @@ def get_messages(
 ) -> SyncCursorPage[OpenAIMessage]:
    aget_messages = kwargs.pop("aget_messages", None)
    optional_params = GenericLiteLLMParams(**kwargs)
    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -1225,6 +1239,7 @@ def get_messages(
            max_retries=optional_params.max_retries,
            client=client,
            aget_messages=aget_messages,
            litellm_params=litellm_params_dict,
        )
    else:
        raise litellm.exceptions.BadRequestError(
@ -1337,6 +1352,7 @@ def run_thread(
    """Run a given thread + assistant."""
    arun_thread = kwargs.pop("arun_thread", None)
    optional_params = GenericLiteLLMParams(**kwargs)
    litellm_params_dict = get_litellm_params(**kwargs)
    ### TIMEOUT LOGIC ###
    timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -1437,6 +1453,7 @@ def run_thread(
            max_retries=optional_params.max_retries,
            client=client,
            arun_thread=arun_thread,
            litellm_params=litellm_params_dict,
        )  # type: ignore
    else:
        raise litellm.exceptions.BadRequestError(
--- a/litellm/batches/main.py
+++ b/litellm/batches/main.py
@ -111,6 +111,7 @@ def create_batch(
        proxy_server_request = kwargs.get("proxy_server_request", None)
        model_info = kwargs.get("model_info", None)
        _is_async = kwargs.pop("acreate_batch", False) is True
        litellm_params = get_litellm_params(**kwargs)
        litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -217,6 +218,7 @@ def create_batch(
                timeout=timeout,
                max_retries=optional_params.max_retries,
                create_batch_data=_create_batch_request,
                litellm_params=litellm_params,
            )
        elif custom_llm_provider == "vertex_ai":
            api_base = optional_params.api_base or ""
@ -320,15 +322,12 @@ def retrieve_batch(
    """
    try:
        optional_params = GenericLiteLLMParams(**kwargs)
        litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
        litellm_params = get_litellm_params(
            custom_llm_provider=custom_llm_provider,
-            litellm_call_id=kwargs.get("litellm_call_id", None),
+            **kwargs,
            litellm_trace_id=kwargs.get("litellm_trace_id"),
            litellm_metadata=kwargs.get("litellm_metadata"),
        )
        litellm_logging_obj.update_environment_variables(
            model=None,
@ -424,6 +423,7 @@ def retrieve_batch(
                timeout=timeout,
                max_retries=optional_params.max_retries,
                retrieve_batch_data=_retrieve_batch_request,
                litellm_params=litellm_params,
            )
        elif custom_llm_provider == "vertex_ai":
            api_base = optional_params.api_base or ""
@ -526,6 +526,10 @@ def list_batches(
    try:
        # set API KEY
        optional_params = GenericLiteLLMParams(**kwargs)
        litellm_params = get_litellm_params(
            custom_llm_provider=custom_llm_provider,
            **kwargs,
        )
        api_key = (
            optional_params.api_key
            or litellm.api_key  # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
@ -603,6 +607,7 @@ def list_batches(
                api_version=api_version,
                timeout=timeout,
                max_retries=optional_params.max_retries,
                litellm_params=litellm_params,
            )
        else:
            raise litellm.exceptions.BadRequestError(
@ -678,6 +683,10 @@ def cancel_batch(
    """
    try:
        optional_params = GenericLiteLLMParams(**kwargs)
        litellm_params = get_litellm_params(
            custom_llm_provider=custom_llm_provider,
            **kwargs,
        )
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
        # set timeout for 10 minutes by default
@ -765,6 +774,7 @@ def cancel_batch(
                timeout=timeout,
                max_retries=optional_params.max_retries,
                cancel_batch_data=_cancel_batch_request,
                litellm_params=litellm_params,
            )
        else:
            raise litellm.exceptions.BadRequestError(
--- a/litellm/caching/llm_caching_handler.py
+++ b/litellm/caching/llm_caching_handler.py
@ -0,0 +1,40 @@
 """
 Add the event loop to the cache key, to prevent event loop closed errors.
 """
 import asyncio
 from .in_memory_cache import InMemoryCache
 class LLMClientCache(InMemoryCache):
    def update_cache_key_with_event_loop(self, key):
        """
        Add the event loop to the cache key, to prevent event loop closed errors.
        If none, use the key as is.
        """
        try:
            event_loop = asyncio.get_event_loop()
            stringified_event_loop = str(id(event_loop))
            return f"{key}-{stringified_event_loop}"
        except Exception:  # handle no current event loop
            return key
    def set_cache(self, key, value, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return super().set_cache(key, value, **kwargs)
    async def async_set_cache(self, key, value, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return await super().async_set_cache(key, value, **kwargs)
    def get_cache(self, key, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return super().get_cache(key, **kwargs)
    async def async_get_cache(self, key, **kwargs):
        key = self.update_cache_key_with_event_loop(key)
        return await super().async_get_cache(key, **kwargs)
--- a/litellm/constants.py
+++ b/litellm/constants.py
@ -18,6 +18,7 @@ SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
 LITELLM_CHAT_PROVIDERS = [
    "openai",
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@ -44,7 +44,12 @@ from litellm.llms.vertex_ai.cost_calculator import cost_router as google_cost_ro
 from litellm.llms.vertex_ai.image_generation.cost_calculator import (
    cost_calculator as vertex_ai_image_cost_calculator,
 )
-from litellm.types.llms.openai import HttpxBinaryResponseContent
+from litellm.responses.utils import ResponseAPILoggingUtils
 from litellm.types.llms.openai import (
    HttpxBinaryResponseContent,
    ResponseAPIUsage,
    ResponsesAPIResponse,
 )
 from litellm.types.rerank import RerankBilledUnits, RerankResponse
 from litellm.types.utils import (
    CallTypesLiteral,
@ -464,6 +469,13 @@ def _get_usage_object(
    return usage_obj
 def _is_known_usage_objects(usage_obj):
    """Returns True if the usage obj is a known Usage type"""
    return isinstance(usage_obj, litellm.Usage) or isinstance(
        usage_obj, ResponseAPIUsage
    )
 def _infer_call_type(
    call_type: Optional[CallTypesLiteral], completion_response: Any
 ) -> Optional[CallTypesLiteral]:
@ -585,8 +597,8 @@ def completion_cost(  # noqa: PLR0915
                )
            else:
                usage_obj = getattr(completion_response, "usage", {})
-            if isinstance(usage_obj, BaseModel) and not isinstance(
+            if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
-                usage_obj, litellm.Usage
+                usage_obj=usage_obj
            ):
                setattr(
                    completion_response,
@ -599,6 +611,14 @@ def completion_cost(  # noqa: PLR0915
                _usage = usage_obj.model_dump()
            else:
                _usage = usage_obj
            if ResponseAPILoggingUtils._is_response_api_usage(_usage):
                _usage = (
                    ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
                        _usage
                    ).model_dump()
                )
            # get input/output tokens from completion_response
            prompt_tokens = _usage.get("prompt_tokens", 0)
            completion_tokens = _usage.get("completion_tokens", 0)
@ -797,6 +817,7 @@ def response_cost_calculator(
        TextCompletionResponse,
        HttpxBinaryResponseContent,
        RerankResponse,
        ResponsesAPIResponse,
    ],
    model: str,
    custom_llm_provider: Optional[str],
--- a/litellm/files/main.py
+++ b/litellm/files/main.py
@ -25,7 +25,7 @@ from litellm.types.llms.openai import (
    HttpxBinaryResponseContent,
 )
 from litellm.types.router import *
-from litellm.utils import supports_httpx_timeout
+from litellm.utils import get_litellm_params, supports_httpx_timeout
 ####### ENVIRONMENT VARIABLES ###################
 openai_files_instance = OpenAIFilesAPI()
@ -546,6 +546,7 @@ def create_file(
    try:
        _is_async = kwargs.pop("acreate_file", False) is True
        optional_params = GenericLiteLLMParams(**kwargs)
        litellm_params_dict = get_litellm_params(**kwargs)
        ### TIMEOUT LOGIC ###
        timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
@ -630,6 +631,7 @@ def create_file(
                timeout=timeout,
                max_retries=optional_params.max_retries,
                create_file_data=_create_file_request,
                litellm_params=litellm_params_dict,
            )
        elif custom_llm_provider == "vertex_ai":
            api_base = optional_params.api_base or ""
--- a/litellm/integrations/custom_logger.py
+++ b/litellm/integrations/custom_logger.py
@ -239,6 +239,7 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ) -> Any:
        pass
--- a/litellm/litellm_core_utils/credential_accessor.py
+++ b/litellm/litellm_core_utils/credential_accessor.py
@ -0,0 +1,34 @@
 """Utils for accessing credentials."""
 from typing import List
 import litellm
 from litellm.types.utils import CredentialItem
 class CredentialAccessor:
    @staticmethod
    def get_credential_values(credential_name: str) -> dict:
        """Safe accessor for credentials."""
        if not litellm.credential_list:
            return {}
        for credential in litellm.credential_list:
            if credential.credential_name == credential_name:
                return credential.credential_values.copy()
        return {}
    @staticmethod
    def upsert_credentials(credentials: List[CredentialItem]):
        """Add a credential to the list of credentials."""
        credential_names = [cred.credential_name for cred in litellm.credential_list]
        for credential in credentials:
            if credential.credential_name in credential_names:
                # Find and replace the existing credential in the list
                for i, existing_cred in enumerate(litellm.credential_list):
                    if existing_cred.credential_name == credential.credential_name:
                        litellm.credential_list[i] = credential
                        break
            else:
                litellm.credential_list.append(credential)
--- a/litellm/litellm_core_utils/get_litellm_params.py
+++ b/litellm/litellm_core_utils/get_litellm_params.py
@ -59,6 +59,7 @@ def get_litellm_params(
    ssl_verify: Optional[bool] = None,
    merge_reasoning_content_in_choices: Optional[bool] = None,
    api_version: Optional[str] = None,
    max_retries: Optional[int] = None,
    **kwargs,
 ) -> dict:
    litellm_params = {
@ -101,5 +102,13 @@ def get_litellm_params(
        "ssl_verify": ssl_verify,
        "merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
        "api_version": api_version,
        "azure_ad_token": kwargs.get("azure_ad_token"),
        "tenant_id": kwargs.get("tenant_id"),
        "client_id": kwargs.get("client_id"),
        "client_secret": kwargs.get("client_secret"),
        "azure_username": kwargs.get("azure_username"),
        "azure_password": kwargs.get("azure_password"),
        "max_retries": max_retries,
        "timeout": kwargs.get("timeout"),
    }
    return litellm_params
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@ -39,11 +39,14 @@ from litellm.litellm_core_utils.redact_messages import (
    redact_message_input_output_from_custom_logger,
    redact_message_input_output_from_logging,
 )
 from litellm.responses.utils import ResponseAPILoggingUtils
 from litellm.types.llms.openai import (
    AllMessageValues,
    Batch,
    FineTuningJob,
    HttpxBinaryResponseContent,
    ResponseCompletedEvent,
    ResponsesAPIResponse,
 )
 from litellm.types.rerank import RerankResponse
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
@ -851,6 +854,8 @@ class Logging(LiteLLMLoggingBaseClass):
            RerankResponse,
            Batch,
            FineTuningJob,
            ResponsesAPIResponse,
            ResponseCompletedEvent,
        ],
        cache_hit: Optional[bool] = None,
    ) -> Optional[float]:
@ -1000,7 +1005,7 @@ class Logging(LiteLLMLoggingBaseClass):
                standard_logging_object is None
                and result is not None
                and self.stream is not True
-            ):  # handle streaming separately
+            ):
                if (
                    isinstance(result, ModelResponse)
                    or isinstance(result, ModelResponseStream)
@ -1012,6 +1017,7 @@ class Logging(LiteLLMLoggingBaseClass):
                    or isinstance(result, RerankResponse)
                    or isinstance(result, FineTuningJob)
                    or isinstance(result, LiteLLMBatch)
                    or isinstance(result, ResponsesAPIResponse)
                ):
                    ## HIDDEN PARAMS ##
                    hidden_params = getattr(result, "_hidden_params", {})
@ -1111,7 +1117,7 @@ class Logging(LiteLLMLoggingBaseClass):
            ## BUILD COMPLETE STREAMED RESPONSE
            complete_streaming_response: Optional[
-                Union[ModelResponse, TextCompletionResponse]
+                Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]
            ] = None
            if "complete_streaming_response" in self.model_call_details:
                return  # break out of this.
@ -1633,7 +1639,7 @@ class Logging(LiteLLMLoggingBaseClass):
        if "async_complete_streaming_response" in self.model_call_details:
            return  # break out of this.
        complete_streaming_response: Optional[
-            Union[ModelResponse, TextCompletionResponse]
+            Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]
        ] = self._get_assembled_streaming_response(
            result=result,
            start_time=start_time,
@ -2343,16 +2349,24 @@ class Logging(LiteLLMLoggingBaseClass):
    def _get_assembled_streaming_response(
        self,
-        result: Union[ModelResponse, TextCompletionResponse, ModelResponseStream, Any],
+        result: Union[
            ModelResponse,
            TextCompletionResponse,
            ModelResponseStream,
            ResponseCompletedEvent,
            Any,
        ],
        start_time: datetime.datetime,
        end_time: datetime.datetime,
        is_async: bool,
        streaming_chunks: List[Any],
-    ) -> Optional[Union[ModelResponse, TextCompletionResponse]]:
+    ) -> Optional[Union[ModelResponse, TextCompletionResponse, ResponsesAPIResponse]]:
        if isinstance(result, ModelResponse):
            return result
        elif isinstance(result, TextCompletionResponse):
            return result
        elif isinstance(result, ResponseCompletedEvent):
            return result.response
        elif isinstance(result, ModelResponseStream):
            complete_streaming_response: Optional[
                Union[ModelResponse, TextCompletionResponse]
@ -3111,6 +3125,12 @@ class StandardLoggingPayloadSetup:
        elif isinstance(usage, Usage):
            return usage
        elif isinstance(usage, dict):
            if ResponseAPILoggingUtils._is_response_api_usage(usage):
                return (
                    ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
                        usage
                    )
                )
            return Usage(**usage)
        raise ValueError(f"usage is required, got={usage} of type {type(usage)}")
--- a/litellm/llms/azure/assistants.py
+++ b/litellm/llms/azure/assistants.py
@ -1,4 +1,4 @@
-from typing import Coroutine, Iterable, Literal, Optional, Union
+from typing import Any, Coroutine, Dict, Iterable, Literal, Optional, Union
 import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
@ -18,10 +18,10 @@ from ...types.llms.openai import (
    SyncCursorPage,
    Thread,
 )
-from ..base import BaseLLM
+from .common_utils import BaseAzureLLM
-class AzureAssistantsAPI(BaseLLM):
+class AzureAssistantsAPI(BaseAzureLLM):
    def __init__(self) -> None:
        super().__init__()
@ -34,18 +34,17 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ) -> AzureOpenAI:
        received_args = locals()
        if client is None:
-            data = {}
+            azure_client_params = self.initialize_azure_sdk_client(
-            for k, v in received_args.items():
+                litellm_params=litellm_params or {},
-                if k == "self" or k == "client":
+                api_key=api_key,
-                    pass
+                api_base=api_base,
-                elif k == "api_base" and v is not None:
+                model_name="",
-                    data["azure_endpoint"] = v
+                api_version=api_version,
-                elif v is not None:
+            )
-                    data[k] = v
+            azure_openai_client = AzureOpenAI(**azure_client_params)  # type: ignore
            azure_openai_client = AzureOpenAI(**data)  # type: ignore
        else:
            azure_openai_client = client
@ -60,18 +59,18 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ) -> AsyncAzureOpenAI:
        received_args = locals()
        if client is None:
-            data = {}
+            azure_client_params = self.initialize_azure_sdk_client(
-            for k, v in received_args.items():
+                litellm_params=litellm_params or {},
-                if k == "self" or k == "client":
+                api_key=api_key,
-                    pass
+                api_base=api_base,
-                elif k == "api_base" and v is not None:
+                model_name="",
-                    data["azure_endpoint"] = v
+                api_version=api_version,
-                elif v is not None:
+            )
-                    data[k] = v
+
-            azure_openai_client = AsyncAzureOpenAI(**data)
+            azure_openai_client = AsyncAzureOpenAI(**azure_client_params)
            # azure_openai_client = AsyncAzureOpenAI(**data)  # type: ignore
        else:
            azure_openai_client = client
@ -89,6 +88,7 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        litellm_params: Optional[dict] = None,
    ) -> AsyncCursorPage[Assistant]:
        azure_openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -98,6 +98,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = await azure_openai_client.beta.assistants.list()
@ -146,6 +147,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client=None,
        aget_assistants=None,
        litellm_params: Optional[dict] = None,
    ):
        if aget_assistants is not None and aget_assistants is True:
            return self.async_get_assistants(
@ -156,6 +158,7 @@ class AzureAssistantsAPI(BaseLLM):
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                litellm_params=litellm_params,
            )
        azure_openai_client = self.get_azure_client(
            api_key=api_key,
@ -165,6 +168,7 @@ class AzureAssistantsAPI(BaseLLM):
            max_retries=max_retries,
            client=client,
            api_version=api_version,
            litellm_params=litellm_params,
        )
        response = azure_openai_client.beta.assistants.list()
@ -184,6 +188,7 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ) -> OpenAIMessage:
        openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -193,6 +198,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        thread_message: OpenAIMessage = await openai_client.beta.threads.messages.create(  # type: ignore
@ -222,6 +228,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        a_add_message: Literal[True],
        litellm_params: Optional[dict] = None,
    ) -> Coroutine[None, None, OpenAIMessage]:
        ...
@ -238,6 +245,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AzureOpenAI],
        a_add_message: Optional[Literal[False]],
        litellm_params: Optional[dict] = None,
    ) -> OpenAIMessage:
        ...
@ -255,6 +263,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client=None,
        a_add_message: Optional[bool] = None,
        litellm_params: Optional[dict] = None,
    ):
        if a_add_message is not None and a_add_message is True:
            return self.a_add_message(
@ -267,6 +276,7 @@ class AzureAssistantsAPI(BaseLLM):
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                litellm_params=litellm_params,
            )
        openai_client = self.get_azure_client(
            api_key=api_key,
@ -300,6 +310,7 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ) -> AsyncCursorPage[OpenAIMessage]:
        openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -309,6 +320,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = await openai_client.beta.threads.messages.list(thread_id=thread_id)
@ -329,6 +341,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        aget_messages: Literal[True],
        litellm_params: Optional[dict] = None,
    ) -> Coroutine[None, None, AsyncCursorPage[OpenAIMessage]]:
        ...
@ -344,6 +357,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AzureOpenAI],
        aget_messages: Optional[Literal[False]],
        litellm_params: Optional[dict] = None,
    ) -> SyncCursorPage[OpenAIMessage]:
        ...
@ -360,6 +374,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client=None,
        aget_messages=None,
        litellm_params: Optional[dict] = None,
    ):
        if aget_messages is not None and aget_messages is True:
            return self.async_get_messages(
@ -371,6 +386,7 @@ class AzureAssistantsAPI(BaseLLM):
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                litellm_params=litellm_params,
            )
        openai_client = self.get_azure_client(
            api_key=api_key,
@ -380,6 +396,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = openai_client.beta.threads.messages.list(thread_id=thread_id)
@ -399,6 +416,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
        litellm_params: Optional[dict] = None,
    ) -> Thread:
        openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -408,6 +426,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        data = {}
@ -435,6 +454,7 @@ class AzureAssistantsAPI(BaseLLM):
        messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
        client: Optional[AsyncAzureOpenAI],
        acreate_thread: Literal[True],
        litellm_params: Optional[dict] = None,
    ) -> Coroutine[None, None, Thread]:
        ...
@ -451,6 +471,7 @@ class AzureAssistantsAPI(BaseLLM):
        messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
        client: Optional[AzureOpenAI],
        acreate_thread: Optional[Literal[False]],
        litellm_params: Optional[dict] = None,
    ) -> Thread:
        ...
@ -468,6 +489,7 @@ class AzureAssistantsAPI(BaseLLM):
        messages: Optional[Iterable[OpenAICreateThreadParamsMessage]],
        client=None,
        acreate_thread=None,
        litellm_params: Optional[dict] = None,
    ):
        """
        Here's an example:
@ -490,6 +512,7 @@ class AzureAssistantsAPI(BaseLLM):
                max_retries=max_retries,
                client=client,
                messages=messages,
                litellm_params=litellm_params,
            )
        azure_openai_client = self.get_azure_client(
            api_key=api_key,
@ -499,6 +522,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        data = {}
@ -521,6 +545,7 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        litellm_params: Optional[dict] = None,
    ) -> Thread:
        openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -530,6 +555,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = await openai_client.beta.threads.retrieve(thread_id=thread_id)
@ -550,6 +576,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        aget_thread: Literal[True],
        litellm_params: Optional[dict] = None,
    ) -> Coroutine[None, None, Thread]:
        ...
@ -565,6 +592,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AzureOpenAI],
        aget_thread: Optional[Literal[False]],
        litellm_params: Optional[dict] = None,
    ) -> Thread:
        ...
@ -581,6 +609,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client=None,
        aget_thread=None,
        litellm_params: Optional[dict] = None,
    ):
        if aget_thread is not None and aget_thread is True:
            return self.async_get_thread(
@ -592,6 +621,7 @@ class AzureAssistantsAPI(BaseLLM):
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                litellm_params=litellm_params,
            )
        openai_client = self.get_azure_client(
            api_key=api_key,
@ -601,6 +631,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = openai_client.beta.threads.retrieve(thread_id=thread_id)
@ -618,7 +649,7 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -629,6 +660,7 @@ class AzureAssistantsAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        litellm_params: Optional[dict] = None,
    ) -> Run:
        openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -638,6 +670,7 @@ class AzureAssistantsAPI(BaseLLM):
            api_version=api_version,
            azure_ad_token=azure_ad_token,
            client=client,
            litellm_params=litellm_params,
        )
        response = await openai_client.beta.threads.runs.create_and_poll(  # type: ignore
@ -645,7 +678,7 @@ class AzureAssistantsAPI(BaseLLM):
            assistant_id=assistant_id,
            additional_instructions=additional_instructions,
            instructions=instructions,
-            metadata=metadata,
+            metadata=metadata,  # type: ignore
            model=model,
            tools=tools,
        )
@ -659,12 +692,13 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        tools: Optional[Iterable[AssistantToolParam]],
        event_handler: Optional[AssistantEventHandler],
        litellm_params: Optional[dict] = None,
    ) -> AsyncAssistantStreamManager[AsyncAssistantEventHandler]:
-        data = {
+        data: Dict[str, Any] = {
            "thread_id": thread_id,
            "assistant_id": assistant_id,
            "additional_instructions": additional_instructions,
@ -684,12 +718,13 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        tools: Optional[Iterable[AssistantToolParam]],
        event_handler: Optional[AssistantEventHandler],
        litellm_params: Optional[dict] = None,
    ) -> AssistantStreamManager[AssistantEventHandler]:
-        data = {
+        data: Dict[str, Any] = {
            "thread_id": thread_id,
            "assistant_id": assistant_id,
            "additional_instructions": additional_instructions,
@ -711,7 +746,7 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -733,7 +768,7 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -756,7 +791,7 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -769,6 +804,7 @@ class AzureAssistantsAPI(BaseLLM):
        client=None,
        arun_thread=None,
        event_handler: Optional[AssistantEventHandler] = None,
        litellm_params: Optional[dict] = None,
    ):
        if arun_thread is not None and arun_thread is True:
            if stream is not None and stream is True:
@ -780,6 +816,7 @@ class AzureAssistantsAPI(BaseLLM):
                    timeout=timeout,
                    max_retries=max_retries,
                    client=client,
                    litellm_params=litellm_params,
                )
                return self.async_run_thread_stream(
                    client=azure_client,
@ -791,13 +828,14 @@ class AzureAssistantsAPI(BaseLLM):
                    model=model,
                    tools=tools,
                    event_handler=event_handler,
                    litellm_params=litellm_params,
                )
            return self.arun_thread(
                thread_id=thread_id,
                assistant_id=assistant_id,
                additional_instructions=additional_instructions,
                instructions=instructions,
-                metadata=metadata,
+                metadata=metadata,  # type: ignore
                model=model,
                stream=stream,
                tools=tools,
@ -808,6 +846,7 @@ class AzureAssistantsAPI(BaseLLM):
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                litellm_params=litellm_params,
            )
        openai_client = self.get_azure_client(
            api_key=api_key,
@ -817,6 +856,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        if stream is not None and stream is True:
@ -830,6 +870,7 @@ class AzureAssistantsAPI(BaseLLM):
                model=model,
                tools=tools,
                event_handler=event_handler,
                litellm_params=litellm_params,
            )
        response = openai_client.beta.threads.runs.create_and_poll(  # type: ignore
@ -837,7 +878,7 @@ class AzureAssistantsAPI(BaseLLM):
            assistant_id=assistant_id,
            additional_instructions=additional_instructions,
            instructions=instructions,
-            metadata=metadata,
+            metadata=metadata,  # type: ignore
            model=model,
            tools=tools,
        )
@ -855,6 +896,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        create_assistant_data: dict,
        litellm_params: Optional[dict] = None,
    ) -> Assistant:
        azure_openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -864,6 +906,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = await azure_openai_client.beta.assistants.create(
@ -882,6 +925,7 @@ class AzureAssistantsAPI(BaseLLM):
        create_assistant_data: dict,
        client=None,
        async_create_assistants=None,
        litellm_params: Optional[dict] = None,
    ):
        if async_create_assistants is not None and async_create_assistants is True:
            return self.async_create_assistants(
@ -893,6 +937,7 @@ class AzureAssistantsAPI(BaseLLM):
                max_retries=max_retries,
                client=client,
                create_assistant_data=create_assistant_data,
                litellm_params=litellm_params,
            )
        azure_openai_client = self.get_azure_client(
            api_key=api_key,
@ -902,6 +947,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = azure_openai_client.beta.assistants.create(**create_assistant_data)
@ -918,6 +964,7 @@ class AzureAssistantsAPI(BaseLLM):
        max_retries: Optional[int],
        client: Optional[AsyncAzureOpenAI],
        assistant_id: str,
        litellm_params: Optional[dict] = None,
    ):
        azure_openai_client = self.async_get_azure_client(
            api_key=api_key,
@ -927,6 +974,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = await azure_openai_client.beta.assistants.delete(
@ -945,6 +993,7 @@ class AzureAssistantsAPI(BaseLLM):
        assistant_id: str,
        async_delete_assistants: Optional[bool] = None,
        client=None,
        litellm_params: Optional[dict] = None,
    ):
        if async_delete_assistants is not None and async_delete_assistants is True:
            return self.async_delete_assistant(
@ -956,6 +1005,7 @@ class AzureAssistantsAPI(BaseLLM):
                max_retries=max_retries,
                client=client,
                assistant_id=assistant_id,
                litellm_params=litellm_params,
            )
        azure_openai_client = self.get_azure_client(
            api_key=api_key,
@ -965,6 +1015,7 @@ class AzureAssistantsAPI(BaseLLM):
            timeout=timeout,
            max_retries=max_retries,
            client=client,
            litellm_params=litellm_params,
        )
        response = azure_openai_client.beta.assistants.delete(assistant_id=assistant_id)
--- a/litellm/llms/azure/audio_transcriptions.py
+++ b/litellm/llms/azure/audio_transcriptions.py
@ -13,11 +13,7 @@ from litellm.utils import (
    extract_duration_from_srt_or_vtt,
 )
-from .azure import (
+from .azure import AzureChatCompletion
    AzureChatCompletion,
    get_azure_ad_token_from_oidc,
    select_azure_base_url_or_endpoint,
 )
 class AzureAudioTranscription(AzureChatCompletion):
@ -36,29 +32,18 @@ class AzureAudioTranscription(AzureChatCompletion):
        client=None,
        azure_ad_token: Optional[str] = None,
        atranscription: bool = False,
        litellm_params: Optional[dict] = None,
    ) -> TranscriptionResponse:
        data = {"model": model, "file": audio_file, **optional_params}
        # init AzureOpenAI Client
-        azure_client_params = {
+        azure_client_params = self.initialize_azure_sdk_client(
-            "api_version": api_version,
+            litellm_params=litellm_params or {},
-            "azure_endpoint": api_base,
+            api_key=api_key,
-            "azure_deployment": model,
+            model_name=model,
-            "timeout": timeout,
+            api_version=api_version,
-        }
+            api_base=api_base,
        azure_client_params = select_azure_base_url_or_endpoint(
            azure_client_params=azure_client_params
        )
        if api_key is not None:
            azure_client_params["api_key"] = api_key
        elif azure_ad_token is not None:
            if azure_ad_token.startswith("oidc/"):
                azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
            azure_client_params["azure_ad_token"] = azure_ad_token
        if max_retries is not None:
            azure_client_params["max_retries"] = max_retries
        if atranscription is True:
            return self.async_audio_transcriptions(  # type: ignore
@ -128,7 +113,6 @@ class AzureAudioTranscription(AzureChatCompletion):
            if client is None:
                async_azure_client = AsyncAzureOpenAI(
                    **azure_client_params,
                    http_client=litellm.aclient_session,
                )
            else:
                async_azure_client = client
--- a/litellm/llms/azure/azure.py
+++ b/litellm/llms/azure/azure.py
@ -1,6 +1,5 @@
 import asyncio
 import json
 import os
 import time
 from typing import Any, Callable, Dict, List, Literal, Optional, Union
@ -8,7 +7,6 @@ import httpx  # type: ignore
 from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
 import litellm
 from litellm.caching.caching import DualCache
 from litellm.constants import DEFAULT_MAX_RETRIES
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.custom_httpx.http_handler import (
@ -25,15 +23,18 @@ from litellm.types.utils import (
 from litellm.utils import (
    CustomStreamWrapper,
    convert_to_model_response_object,
    get_secret,
    modify_url,
 )
 from ...types.llms.openai import HttpxBinaryResponseContent
 from ..base import BaseLLM
-from .common_utils import AzureOpenAIError, process_azure_headers
+from .common_utils import (
-
+    AzureOpenAIError,
-azure_ad_cache = DualCache()
+    BaseAzureLLM,
    get_azure_ad_token_from_oidc,
    process_azure_headers,
    select_azure_base_url_or_endpoint,
 )
 class AzureOpenAIAssistantsAPIConfig:
@ -98,93 +99,6 @@ class AzureOpenAIAssistantsAPIConfig:
        return optional_params
 def select_azure_base_url_or_endpoint(azure_client_params: dict):
    azure_endpoint = azure_client_params.get("azure_endpoint", None)
    if azure_endpoint is not None:
        # see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
        if "/openai/deployments" in azure_endpoint:
            # this is base_url, not an azure_endpoint
            azure_client_params["base_url"] = azure_endpoint
            azure_client_params.pop("azure_endpoint")
    return azure_client_params
 def get_azure_ad_token_from_oidc(azure_ad_token: str):
    azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
    azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
    azure_authority_host = os.getenv(
        "AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
    )
    if azure_client_id is None or azure_tenant_id is None:
        raise AzureOpenAIError(
            status_code=422,
            message="AZURE_CLIENT_ID and AZURE_TENANT_ID must be set",
        )
    oidc_token = get_secret(azure_ad_token)
    if oidc_token is None:
        raise AzureOpenAIError(
            status_code=401,
            message="OIDC token could not be retrieved from secret manager.",
        )
    azure_ad_token_cache_key = json.dumps(
        {
            "azure_client_id": azure_client_id,
            "azure_tenant_id": azure_tenant_id,
            "azure_authority_host": azure_authority_host,
            "oidc_token": oidc_token,
        }
    )
    azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
    if azure_ad_token_access_token is not None:
        return azure_ad_token_access_token
    client = litellm.module_level_client
    req_token = client.post(
        f"{azure_authority_host}/{azure_tenant_id}/oauth2/v2.0/token",
        data={
            "client_id": azure_client_id,
            "grant_type": "client_credentials",
            "scope": "https://cognitiveservices.azure.com/.default",
            "client_assertion_type": "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
            "client_assertion": oidc_token,
        },
    )
    if req_token.status_code != 200:
        raise AzureOpenAIError(
            status_code=req_token.status_code,
            message=req_token.text,
        )
    azure_ad_token_json = req_token.json()
    azure_ad_token_access_token = azure_ad_token_json.get("access_token", None)
    azure_ad_token_expires_in = azure_ad_token_json.get("expires_in", None)
    if azure_ad_token_access_token is None:
        raise AzureOpenAIError(
            status_code=422, message="Azure AD Token access_token not returned"
        )
    if azure_ad_token_expires_in is None:
        raise AzureOpenAIError(
            status_code=422, message="Azure AD Token expires_in not returned"
        )
    azure_ad_cache.set_cache(
        key=azure_ad_token_cache_key,
        value=azure_ad_token_access_token,
        ttl=azure_ad_token_expires_in,
    )
    return azure_ad_token_access_token
 def _check_dynamic_azure_params(
    azure_client_params: dict,
    azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]],
@ -206,7 +120,7 @@ def _check_dynamic_azure_params(
    return False
-class AzureChatCompletion(BaseLLM):
+class AzureChatCompletion(BaseAzureLLM, BaseLLM):
    def __init__(self) -> None:
        super().__init__()
@ -238,27 +152,16 @@ class AzureChatCompletion(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        client: Optional[Any],
        client_type: Literal["sync", "async"],
        litellm_params: Optional[dict] = None,
    ):
        # init AzureOpenAI Client
-        azure_client_params: Dict[str, Any] = {
+        azure_client_params: Dict[str, Any] = self.initialize_azure_sdk_client(
-            "api_version": api_version,
+            litellm_params=litellm_params or {},
-            "azure_endpoint": api_base,
+            api_key=api_key,
-            "azure_deployment": model,
+            model_name=model,
-            "http_client": litellm.client_session,
+            api_version=api_version,
-            "max_retries": max_retries,
+            api_base=api_base,
            "timeout": timeout,
        }
        azure_client_params = select_azure_base_url_or_endpoint(
            azure_client_params=azure_client_params
        )
        if api_key is not None:
            azure_client_params["api_key"] = api_key
        elif azure_ad_token is not None:
            if azure_ad_token.startswith("oidc/"):
                azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
            azure_client_params["azure_ad_token"] = azure_ad_token
        elif azure_ad_token_provider is not None:
            azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
        if client is None:
            if client_type == "sync":
                azure_client = AzureOpenAI(**azure_client_params)  # type: ignore
@ -357,6 +260,13 @@ class AzureChatCompletion(BaseLLM):
                max_retries = DEFAULT_MAX_RETRIES
            json_mode: Optional[bool] = optional_params.pop("json_mode", False)
            azure_client_params = self.initialize_azure_sdk_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                api_base=api_base,
                model_name=model,
                api_version=api_version,
            )
            ### CHECK IF CLOUDFLARE AI GATEWAY ###
            ### if so - set the model as part of the base url
            if "gateway.ai.cloudflare.com" in api_base:
@ -417,6 +327,7 @@ class AzureChatCompletion(BaseLLM):
                        timeout=timeout,
                        client=client,
                        max_retries=max_retries,
                        azure_client_params=azure_client_params,
                    )
                else:
                    return self.acompletion(
@ -434,6 +345,7 @@ class AzureChatCompletion(BaseLLM):
                        logging_obj=logging_obj,
                        max_retries=max_retries,
                        convert_tool_call_to_json_mode=json_mode,
                        azure_client_params=azure_client_params,
                    )
            elif "stream" in optional_params and optional_params["stream"] is True:
                return self.streaming(
@ -470,28 +382,6 @@ class AzureChatCompletion(BaseLLM):
                        status_code=422, message="max retries must be an int"
                    )
                # init AzureOpenAI Client
                azure_client_params = {
                    "api_version": api_version,
                    "azure_endpoint": api_base,
                    "azure_deployment": model,
                    "http_client": litellm.client_session,
                    "max_retries": max_retries,
                    "timeout": timeout,
                }
                azure_client_params = select_azure_base_url_or_endpoint(
                    azure_client_params=azure_client_params
                )
                if api_key is not None:
                    azure_client_params["api_key"] = api_key
                elif azure_ad_token is not None:
                    if azure_ad_token.startswith("oidc/"):
                        azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
                    azure_client_params["azure_ad_token"] = azure_ad_token
                elif azure_ad_token_provider is not None:
                    azure_client_params["azure_ad_token_provider"] = (
                        azure_ad_token_provider
                    )
                if (
                    client is None
                    or not isinstance(client, AzureOpenAI)
@ -566,30 +456,10 @@ class AzureChatCompletion(BaseLLM):
        azure_ad_token_provider: Optional[Callable] = None,
        convert_tool_call_to_json_mode: Optional[bool] = None,
        client=None,  # this is the AsyncAzureOpenAI
        azure_client_params: dict = {},
    ):
        response = None
        try:
            # init AzureOpenAI Client
            azure_client_params = {
                "api_version": api_version,
                "azure_endpoint": api_base,
                "azure_deployment": model,
                "http_client": litellm.aclient_session,
                "max_retries": max_retries,
                "timeout": timeout,
            }
            azure_client_params = select_azure_base_url_or_endpoint(
                azure_client_params=azure_client_params
            )
            if api_key is not None:
                azure_client_params["api_key"] = api_key
            elif azure_ad_token is not None:
                if azure_ad_token.startswith("oidc/"):
                    azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
                azure_client_params["azure_ad_token"] = azure_ad_token
            elif azure_ad_token_provider is not None:
                azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
            # setting Azure client
            if client is None or dynamic_params:
                azure_client = AsyncAzureOpenAI(**azure_client_params)
@ -747,28 +617,9 @@ class AzureChatCompletion(BaseLLM):
        azure_ad_token: Optional[str] = None,
        azure_ad_token_provider: Optional[Callable] = None,
        client=None,
        azure_client_params: dict = {},
    ):
        try:
            # init AzureOpenAI Client
            azure_client_params = {
                "api_version": api_version,
                "azure_endpoint": api_base,
                "azure_deployment": model,
                "http_client": litellm.aclient_session,
                "max_retries": max_retries,
                "timeout": timeout,
            }
            azure_client_params = select_azure_base_url_or_endpoint(
                azure_client_params=azure_client_params
            )
            if api_key is not None:
                azure_client_params["api_key"] = api_key
            elif azure_ad_token is not None:
                if azure_ad_token.startswith("oidc/"):
                    azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
                azure_client_params["azure_ad_token"] = azure_ad_token
            elif azure_ad_token_provider is not None:
                azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
            if client is None or dynamic_params:
                azure_client = AsyncAzureOpenAI(**azure_client_params)
            else:
@ -833,6 +684,7 @@ class AzureChatCompletion(BaseLLM):
    ):
        response = None
        try:
            if client is None:
                openai_aclient = AsyncAzureOpenAI(**azure_client_params)
            else:
@ -884,6 +736,7 @@ class AzureChatCompletion(BaseLLM):
        client=None,
        aembedding=None,
        headers: Optional[dict] = None,
        litellm_params: Optional[dict] = None,
    ) -> EmbeddingResponse:
        if headers:
            optional_params["extra_headers"] = headers
@ -899,29 +752,14 @@ class AzureChatCompletion(BaseLLM):
                )
            # init AzureOpenAI Client
            azure_client_params = {
                "api_version": api_version,
                "azure_endpoint": api_base,
                "azure_deployment": model,
                "max_retries": max_retries,
                "timeout": timeout,
            }
            azure_client_params = select_azure_base_url_or_endpoint(
                azure_client_params=azure_client_params
            )
            if aembedding:
                azure_client_params["http_client"] = litellm.aclient_session
            else:
                azure_client_params["http_client"] = litellm.client_session
            if api_key is not None:
                azure_client_params["api_key"] = api_key
            elif azure_ad_token is not None:
                if azure_ad_token.startswith("oidc/"):
                    azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
                azure_client_params["azure_ad_token"] = azure_ad_token
            elif azure_ad_token_provider is not None:
                azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
            azure_client_params = self.initialize_azure_sdk_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                model_name=model,
                api_version=api_version,
                api_base=api_base,
            )
            ## LOGGING
            logging_obj.pre_call(
                input=input,
@ -1281,6 +1119,7 @@ class AzureChatCompletion(BaseLLM):
        azure_ad_token_provider: Optional[Callable] = None,
        client=None,
        aimg_generation=None,
        litellm_params: Optional[dict] = None,
    ) -> ImageResponse:
        try:
            if model and len(model) > 0:
@ -1305,25 +1144,13 @@ class AzureChatCompletion(BaseLLM):
                )
            # init AzureOpenAI Client
-            azure_client_params: Dict[str, Any] = {
+            azure_client_params: Dict[str, Any] = self.initialize_azure_sdk_client(
-                "api_version": api_version,
+                litellm_params=litellm_params or {},
-                "azure_endpoint": api_base,
+                api_key=api_key,
-                "azure_deployment": model,
+                model_name=model or "",
-                "max_retries": max_retries,
+                api_version=api_version,
-                "timeout": timeout,
+                api_base=api_base,
            }
            azure_client_params = select_azure_base_url_or_endpoint(
                azure_client_params=azure_client_params
            )
            if api_key is not None:
                azure_client_params["api_key"] = api_key
            elif azure_ad_token is not None:
                if azure_ad_token.startswith("oidc/"):
                    azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
                azure_client_params["azure_ad_token"] = azure_ad_token
            elif azure_ad_token_provider is not None:
                azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
            if aimg_generation is True:
                return self.aimage_generation(data=data, input=input, logging_obj=logging_obj, model_response=model_response, api_key=api_key, client=client, azure_client_params=azure_client_params, timeout=timeout, headers=headers)  # type: ignore
@ -1386,6 +1213,7 @@ class AzureChatCompletion(BaseLLM):
        azure_ad_token_provider: Optional[Callable] = None,
        aspeech: Optional[bool] = None,
        client=None,
        litellm_params: Optional[dict] = None,
    ) -> HttpxBinaryResponseContent:
        max_retries = optional_params.pop("max_retries", 2)
@ -1404,6 +1232,7 @@ class AzureChatCompletion(BaseLLM):
                max_retries=max_retries,
                timeout=timeout,
                client=client,
                litellm_params=litellm_params,
            )  # type: ignore
        azure_client: AzureOpenAI = self._get_sync_azure_client(
@ -1417,6 +1246,7 @@ class AzureChatCompletion(BaseLLM):
            timeout=timeout,
            client=client,
            client_type="sync",
            litellm_params=litellm_params,
        )  # type: ignore
        response = azure_client.audio.speech.create(
@ -1441,6 +1271,7 @@ class AzureChatCompletion(BaseLLM):
        max_retries: int,
        timeout: Union[float, httpx.Timeout],
        client=None,
        litellm_params: Optional[dict] = None,
    ) -> HttpxBinaryResponseContent:
        azure_client: AsyncAzureOpenAI = self._get_sync_azure_client(
@ -1454,6 +1285,7 @@ class AzureChatCompletion(BaseLLM):
            timeout=timeout,
            client=client,
            client_type="async",
            litellm_params=litellm_params,
        )  # type: ignore
        azure_response = await azure_client.audio.speech.create(
--- a/litellm/llms/azure/batches/handler.py
+++ b/litellm/llms/azure/batches/handler.py
@ -6,7 +6,6 @@ from typing import Any, Coroutine, Optional, Union, cast
 import httpx
 import litellm
 from litellm.llms.azure.azure import AsyncAzureOpenAI, AzureOpenAI
 from litellm.types.llms.openai import (
    Batch,
@ -16,8 +15,10 @@ from litellm.types.llms.openai import (
 )
 from litellm.types.utils import LiteLLMBatch
 from ..common_utils import BaseAzureLLM
-class AzureBatchesAPI:
+
 class AzureBatchesAPI(BaseAzureLLM):
    """
    Azure methods to support for batches
    - create_batch()
@ -29,38 +30,6 @@ class AzureBatchesAPI:
    def __init__(self) -> None:
        super().__init__()
    def get_azure_openai_client(
        self,
        api_key: Optional[str],
        api_base: Optional[str],
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        api_version: Optional[str] = None,
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        _is_async: bool = False,
    ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
        received_args = locals()
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
        if client is None:
            data = {}
            for k, v in received_args.items():
                if k == "self" or k == "client" or k == "_is_async":
                    pass
                elif k == "api_base" and v is not None:
                    data["azure_endpoint"] = v
                elif v is not None:
                    data[k] = v
            if "api_version" not in data:
                data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
            if _is_async is True:
                openai_client = AsyncAzureOpenAI(**data)
            else:
                openai_client = AzureOpenAI(**data)  # type: ignore
        else:
            openai_client = client
        return openai_client
    async def acreate_batch(
        self,
        create_batch_data: CreateBatchRequest,
@ -79,16 +48,16 @@ class AzureBatchesAPI:
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        litellm_params: Optional[dict] = None,
    ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
        azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
            self.get_azure_openai_client(
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                api_version=api_version,
                max_retries=max_retries,
                client=client,
                _is_async=_is_async,
                litellm_params=litellm_params or {},
            )
        )
        if azure_client is None:
@ -125,16 +94,16 @@ class AzureBatchesAPI:
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ):
        azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
            self.get_azure_openai_client(
                api_key=api_key,
                api_base=api_base,
                api_version=api_version,
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                _is_async=_is_async,
                litellm_params=litellm_params or {},
            )
        )
        if azure_client is None:
@ -173,16 +142,16 @@ class AzureBatchesAPI:
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[AzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ):
        azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
            self.get_azure_openai_client(
                api_key=api_key,
                api_base=api_base,
                api_version=api_version,
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                _is_async=_is_async,
                litellm_params=litellm_params or {},
            )
        )
        if azure_client is None:
@ -212,16 +181,16 @@ class AzureBatchesAPI:
        after: Optional[str] = None,
        limit: Optional[int] = None,
        client: Optional[AzureOpenAI] = None,
        litellm_params: Optional[dict] = None,
    ):
        azure_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
            self.get_azure_openai_client(
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                max_retries=max_retries,
                api_version=api_version,
                client=client,
                _is_async=_is_async,
                litellm_params=litellm_params or {},
            )
        )
        if azure_client is None:
--- a/litellm/llms/azure/chat/o_series_handler.py
+++ b/litellm/llms/azure/chat/o_series_handler.py
@ -4,50 +4,69 @@ Handler file for calls to Azure OpenAI's o1/o3 family of models
 Written separately to handle faking streaming for o1 and o3 models.
 """
-from typing import Optional, Union
+from typing import Any, Callable, Optional, Union
 import httpx
-from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
+
 from litellm.types.utils import ModelResponse
 from ...openai.openai import OpenAIChatCompletion
-from ..common_utils import get_azure_openai_client
+from ..common_utils import BaseAzureLLM
-class AzureOpenAIO1ChatCompletion(OpenAIChatCompletion):
+class AzureOpenAIO1ChatCompletion(BaseAzureLLM, OpenAIChatCompletion):
-    def _get_openai_client(
+    def completion(
        self,
-        is_async: bool,
+        model_response: ModelResponse,
        timeout: Union[float, httpx.Timeout],
        optional_params: dict,
        litellm_params: dict,
        logging_obj: Any,
        model: Optional[str] = None,
        messages: Optional[list] = None,
        print_verbose: Optional[Callable] = None,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
-        timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
+        dynamic_params: Optional[bool] = None,
-        max_retries: Optional[int] = 2,
+        azure_ad_token: Optional[str] = None,
        acompletion: bool = False,
        logger_fn=None,
        headers: Optional[dict] = None,
        custom_prompt_dict: dict = {},
        client=None,
        organization: Optional[str] = None,
-        client: Optional[
+        custom_llm_provider: Optional[str] = None,
-            Union[OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI]
+        drop_params: Optional[bool] = None,
        ] = None,
    ) -> Optional[
        Union[
            OpenAI,
            AsyncOpenAI,
            AzureOpenAI,
            AsyncAzureOpenAI,
        ]
    ]:
        # Override to use Azure-specific client initialization
        if not isinstance(client, AzureOpenAI) and not isinstance(
            client, AsyncAzureOpenAI
    ):
-            client = None
+        client = self.get_azure_openai_client(
-
+            litellm_params=litellm_params,
        return get_azure_openai_client(
            api_key=api_key,
            api_base=api_base,
            timeout=timeout,
            max_retries=max_retries,
            organization=organization,
            api_version=api_version,
            client=client,
-            _is_async=is_async,
+            _is_async=acompletion,
        )
        return super().completion(
            model_response=model_response,
            timeout=timeout,
            optional_params=optional_params,
            litellm_params=litellm_params,
            logging_obj=logging_obj,
            model=model,
            messages=messages,
            print_verbose=print_verbose,
            api_key=api_key,
            api_base=api_base,
            api_version=api_version,
            dynamic_params=dynamic_params,
            azure_ad_token=azure_ad_token,
            acompletion=acompletion,
            logger_fn=logger_fn,
            headers=headers,
            custom_prompt_dict=custom_prompt_dict,
            client=client,
            organization=organization,
            custom_llm_provider=custom_llm_provider,
            drop_params=drop_params,
        )
--- a/litellm/llms/azure/common_utils.py
+++ b/litellm/llms/azure/common_utils.py
@ -1,3 +1,5 @@
 import json
 import os
 from typing import Callable, Optional, Union
 import httpx
@ -5,9 +7,15 @@ from openai import AsyncAzureOpenAI, AzureOpenAI
 import litellm
 from litellm._logging import verbose_logger
 from litellm.caching.caching import DualCache
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
 from litellm.secret_managers.get_azure_ad_token_provider import (
    get_azure_ad_token_provider,
 )
 from litellm.secret_managers.main import get_secret_str
 azure_ad_cache = DualCache()
 class AzureOpenAIError(BaseLLMException):
    def __init__(
@ -29,39 +37,6 @@ class AzureOpenAIError(BaseLLMException):
        )
 def get_azure_openai_client(
    api_key: Optional[str],
    api_base: Optional[str],
    timeout: Union[float, httpx.Timeout],
    max_retries: Optional[int],
    api_version: Optional[str] = None,
    organization: Optional[str] = None,
    client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
    _is_async: bool = False,
 ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
    received_args = locals()
    openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
    if client is None:
        data = {}
        for k, v in received_args.items():
            if k == "self" or k == "client" or k == "_is_async":
                pass
            elif k == "api_base" and v is not None:
                data["azure_endpoint"] = v
            elif v is not None:
                data[k] = v
        if "api_version" not in data:
            data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
        if _is_async is True:
            openai_client = AsyncAzureOpenAI(**data)
        else:
            openai_client = AzureOpenAI(**data)  # type: ignore
    else:
        openai_client = client
    return openai_client
 def process_azure_headers(headers: Union[httpx.Headers, dict]) -> dict:
    openai_headers = {}
    if "x-ratelimit-limit-requests" in headers:
@ -180,3 +155,199 @@ def get_azure_ad_token_from_username_password(
    verbose_logger.debug("token_provider %s", token_provider)
    return token_provider
 def get_azure_ad_token_from_oidc(azure_ad_token: str):
    azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
    azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
    azure_authority_host = os.getenv(
        "AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
    )
    if azure_client_id is None or azure_tenant_id is None:
        raise AzureOpenAIError(
            status_code=422,
            message="AZURE_CLIENT_ID and AZURE_TENANT_ID must be set",
        )
    oidc_token = get_secret_str(azure_ad_token)
    if oidc_token is None:
        raise AzureOpenAIError(
            status_code=401,
            message="OIDC token could not be retrieved from secret manager.",
        )
    azure_ad_token_cache_key = json.dumps(
        {
            "azure_client_id": azure_client_id,
            "azure_tenant_id": azure_tenant_id,
            "azure_authority_host": azure_authority_host,
            "oidc_token": oidc_token,
        }
    )
    azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
    if azure_ad_token_access_token is not None:
        return azure_ad_token_access_token
    client = litellm.module_level_client
    req_token = client.post(
        f"{azure_authority_host}/{azure_tenant_id}/oauth2/v2.0/token",
        data={
            "client_id": azure_client_id,
            "grant_type": "client_credentials",
            "scope": "https://cognitiveservices.azure.com/.default",
            "client_assertion_type": "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
            "client_assertion": oidc_token,
        },
    )
    if req_token.status_code != 200:
        raise AzureOpenAIError(
            status_code=req_token.status_code,
            message=req_token.text,
        )
    azure_ad_token_json = req_token.json()
    azure_ad_token_access_token = azure_ad_token_json.get("access_token", None)
    azure_ad_token_expires_in = azure_ad_token_json.get("expires_in", None)
    if azure_ad_token_access_token is None:
        raise AzureOpenAIError(
            status_code=422, message="Azure AD Token access_token not returned"
        )
    if azure_ad_token_expires_in is None:
        raise AzureOpenAIError(
            status_code=422, message="Azure AD Token expires_in not returned"
        )
    azure_ad_cache.set_cache(
        key=azure_ad_token_cache_key,
        value=azure_ad_token_access_token,
        ttl=azure_ad_token_expires_in,
    )
    return azure_ad_token_access_token
 def select_azure_base_url_or_endpoint(azure_client_params: dict):
    azure_endpoint = azure_client_params.get("azure_endpoint", None)
    if azure_endpoint is not None:
        # see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
        if "/openai/deployments" in azure_endpoint:
            # this is base_url, not an azure_endpoint
            azure_client_params["base_url"] = azure_endpoint
            azure_client_params.pop("azure_endpoint")
    return azure_client_params
 class BaseAzureLLM:
    def get_azure_openai_client(
        self,
        litellm_params: dict,
        api_key: Optional[str],
        api_base: Optional[str],
        api_version: Optional[str] = None,
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        _is_async: bool = False,
    ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
        if client is None:
            azure_client_params = self.initialize_azure_sdk_client(
                litellm_params=litellm_params,
                api_key=api_key,
                api_base=api_base,
                model_name="",
                api_version=api_version,
            )
            if _is_async is True:
                openai_client = AsyncAzureOpenAI(**azure_client_params)
            else:
                openai_client = AzureOpenAI(**azure_client_params)  # type: ignore
        else:
            openai_client = client
        return openai_client
    def initialize_azure_sdk_client(
        self,
        litellm_params: dict,
        api_key: Optional[str],
        api_base: Optional[str],
        model_name: str,
        api_version: Optional[str],
    ) -> dict:
        azure_ad_token_provider: Optional[Callable[[], str]] = None
        # If we have api_key, then we have higher priority
        azure_ad_token = litellm_params.get("azure_ad_token")
        tenant_id = litellm_params.get("tenant_id")
        client_id = litellm_params.get("client_id")
        client_secret = litellm_params.get("client_secret")
        azure_username = litellm_params.get("azure_username")
        azure_password = litellm_params.get("azure_password")
        max_retries = litellm_params.get("max_retries")
        timeout = litellm_params.get("timeout")
        if not api_key and tenant_id and client_id and client_secret:
            verbose_logger.debug("Using Azure AD Token Provider for Azure Auth")
            azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
        if azure_username and azure_password and client_id:
            azure_ad_token_provider = get_azure_ad_token_from_username_password(
                azure_username=azure_username,
                azure_password=azure_password,
                client_id=client_id,
            )
        if azure_ad_token is not None and azure_ad_token.startswith("oidc/"):
            azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
        elif (
            not api_key
            and azure_ad_token_provider is None
            and litellm.enable_azure_ad_token_refresh is True
        ):
            try:
                azure_ad_token_provider = get_azure_ad_token_provider()
            except ValueError:
                verbose_logger.debug("Azure AD Token Provider could not be used.")
        if api_version is None:
            api_version = os.getenv(
                "AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
            )
        _api_key = api_key
        if _api_key is not None and isinstance(_api_key, str):
            # only show first 5 chars of api_key
            _api_key = _api_key[:8] + "*" * 15
        verbose_logger.debug(
            f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
        )
        azure_client_params = {
            "api_key": api_key,
            "azure_endpoint": api_base,
            "api_version": api_version,
            "azure_ad_token": azure_ad_token,
            "azure_ad_token_provider": azure_ad_token_provider,
            "http_client": litellm.client_session,
        }
        if max_retries is not None:
            azure_client_params["max_retries"] = max_retries
        if timeout is not None:
            azure_client_params["timeout"] = timeout
        if azure_ad_token_provider is not None:
            azure_client_params["azure_ad_token_provider"] = azure_ad_token_provider
        # this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
        # required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
        azure_client_params = select_azure_base_url_or_endpoint(
            azure_client_params=azure_client_params
        )
        return azure_client_params
--- a/litellm/llms/azure/completion/handler.py
+++ b/litellm/llms/azure/completion/handler.py
@ -6,9 +6,8 @@ import litellm
 from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
 from litellm.utils import CustomStreamWrapper, ModelResponse, TextCompletionResponse
 from ...base import BaseLLM
 from ...openai.completion.transformation import OpenAITextCompletionConfig
-from ..common_utils import AzureOpenAIError
+from ..common_utils import AzureOpenAIError, BaseAzureLLM
 openai_text_completion_config = OpenAITextCompletionConfig()
@ -25,7 +24,7 @@ def select_azure_base_url_or_endpoint(azure_client_params: dict):
    return azure_client_params
-class AzureTextCompletion(BaseLLM):
+class AzureTextCompletion(BaseAzureLLM):
    def __init__(self) -> None:
        super().__init__()
@ -60,7 +59,6 @@ class AzureTextCompletion(BaseLLM):
        headers: Optional[dict] = None,
        client=None,
    ):
        super().completion()
        try:
            if model is None or messages is None:
                raise AzureOpenAIError(
@ -72,6 +70,14 @@ class AzureTextCompletion(BaseLLM):
                messages=messages, model=model, custom_llm_provider="azure_text"
            )
            azure_client_params = self.initialize_azure_sdk_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                model_name=model,
                api_version=api_version,
                api_base=api_base,
            )
            ### CHECK IF CLOUDFLARE AI GATEWAY ###
            ### if so - set the model as part of the base url
            if "gateway.ai.cloudflare.com" in api_base:
@ -118,6 +124,7 @@ class AzureTextCompletion(BaseLLM):
                        azure_ad_token=azure_ad_token,
                        timeout=timeout,
                        client=client,
                        azure_client_params=azure_client_params,
                    )
                else:
                    return self.acompletion(
@ -132,6 +139,7 @@ class AzureTextCompletion(BaseLLM):
                        client=client,
                        logging_obj=logging_obj,
                        max_retries=max_retries,
                        azure_client_params=azure_client_params,
                    )
            elif "stream" in optional_params and optional_params["stream"] is True:
                return self.streaming(
@ -144,6 +152,7 @@ class AzureTextCompletion(BaseLLM):
                    azure_ad_token=azure_ad_token,
                    timeout=timeout,
                    client=client,
                    azure_client_params=azure_client_params,
                )
            else:
                ## LOGGING
@ -165,22 +174,6 @@ class AzureTextCompletion(BaseLLM):
                        status_code=422, message="max retries must be an int"
                    )
                # init AzureOpenAI Client
                azure_client_params = {
                    "api_version": api_version,
                    "azure_endpoint": api_base,
                    "azure_deployment": model,
                    "http_client": litellm.client_session,
                    "max_retries": max_retries,
                    "timeout": timeout,
                    "azure_ad_token_provider": azure_ad_token_provider,
                }
                azure_client_params = select_azure_base_url_or_endpoint(
                    azure_client_params=azure_client_params
                )
                if api_key is not None:
                    azure_client_params["api_key"] = api_key
                elif azure_ad_token is not None:
                    azure_client_params["azure_ad_token"] = azure_ad_token
                if client is None:
                    azure_client = AzureOpenAI(**azure_client_params)
                else:
@ -240,26 +233,11 @@ class AzureTextCompletion(BaseLLM):
        max_retries: int,
        azure_ad_token: Optional[str] = None,
        client=None,  # this is the AsyncAzureOpenAI
        azure_client_params: dict = {},
    ):
        response = None
        try:
            # init AzureOpenAI Client
            azure_client_params = {
                "api_version": api_version,
                "azure_endpoint": api_base,
                "azure_deployment": model,
                "http_client": litellm.client_session,
                "max_retries": max_retries,
                "timeout": timeout,
            }
            azure_client_params = select_azure_base_url_or_endpoint(
                azure_client_params=azure_client_params
            )
            if api_key is not None:
                azure_client_params["api_key"] = api_key
            elif azure_ad_token is not None:
                azure_client_params["azure_ad_token"] = azure_ad_token
            # setting Azure client
            if client is None:
                azure_client = AsyncAzureOpenAI(**azure_client_params)
@ -312,6 +290,7 @@ class AzureTextCompletion(BaseLLM):
        timeout: Any,
        azure_ad_token: Optional[str] = None,
        client=None,
        azure_client_params: dict = {},
    ):
        max_retries = data.pop("max_retries", 2)
        if not isinstance(max_retries, int):
@ -319,21 +298,6 @@ class AzureTextCompletion(BaseLLM):
                status_code=422, message="max retries must be an int"
            )
        # init AzureOpenAI Client
        azure_client_params = {
            "api_version": api_version,
            "azure_endpoint": api_base,
            "azure_deployment": model,
            "http_client": litellm.client_session,
            "max_retries": max_retries,
            "timeout": timeout,
        }
        azure_client_params = select_azure_base_url_or_endpoint(
            azure_client_params=azure_client_params
        )
        if api_key is not None:
            azure_client_params["api_key"] = api_key
        elif azure_ad_token is not None:
            azure_client_params["azure_ad_token"] = azure_ad_token
        if client is None:
            azure_client = AzureOpenAI(**azure_client_params)
        else:
@ -375,24 +339,10 @@ class AzureTextCompletion(BaseLLM):
        timeout: Any,
        azure_ad_token: Optional[str] = None,
        client=None,
        azure_client_params: dict = {},
    ):
        try:
            # init AzureOpenAI Client
            azure_client_params = {
                "api_version": api_version,
                "azure_endpoint": api_base,
                "azure_deployment": model,
                "http_client": litellm.client_session,
                "max_retries": data.pop("max_retries", 2),
                "timeout": timeout,
            }
            azure_client_params = select_azure_base_url_or_endpoint(
                azure_client_params=azure_client_params
            )
            if api_key is not None:
                azure_client_params["api_key"] = api_key
            elif azure_ad_token is not None:
                azure_client_params["azure_ad_token"] = azure_ad_token
            if client is None:
                azure_client = AsyncAzureOpenAI(**azure_client_params)
            else:
--- a/litellm/llms/azure/files/handler.py
+++ b/litellm/llms/azure/files/handler.py
@ -5,13 +5,12 @@ from openai import AsyncAzureOpenAI, AzureOpenAI
 from openai.types.file_deleted import FileDeleted
 from litellm._logging import verbose_logger
 from litellm.llms.base import BaseLLM
 from litellm.types.llms.openai import *
-from ..common_utils import get_azure_openai_client
+from ..common_utils import BaseAzureLLM
-class AzureOpenAIFilesAPI(BaseLLM):
+class AzureOpenAIFilesAPI(BaseAzureLLM):
    """
    AzureOpenAI methods to support for batches
    - create_file()
@ -45,14 +44,15 @@ class AzureOpenAIFilesAPI(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        max_retries: Optional[int],
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        litellm_params: Optional[dict] = None,
    ) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
-            get_azure_openai_client(
+            self.get_azure_openai_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                api_base=api_base,
                api_version=api_version,
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                _is_async=_is_async,
            )
@ -91,17 +91,16 @@ class AzureOpenAIFilesAPI(BaseLLM):
        max_retries: Optional[int],
        api_version: Optional[str] = None,
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        litellm_params: Optional[dict] = None,
    ) -> Union[
        HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]
    ]:
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
-            get_azure_openai_client(
+            self.get_azure_openai_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                api_version=api_version,
                max_retries=max_retries,
                organization=None,
                client=client,
                _is_async=_is_async,
            )
@ -144,14 +143,13 @@ class AzureOpenAIFilesAPI(BaseLLM):
        max_retries: Optional[int],
        api_version: Optional[str] = None,
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        litellm_params: Optional[dict] = None,
    ):
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
-            get_azure_openai_client(
+            self.get_azure_openai_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                max_retries=max_retries,
                organization=None,
                api_version=api_version,
                client=client,
                _is_async=_is_async,
@ -197,14 +195,13 @@ class AzureOpenAIFilesAPI(BaseLLM):
        organization: Optional[str] = None,
        api_version: Optional[str] = None,
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        litellm_params: Optional[dict] = None,
    ):
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
-            get_azure_openai_client(
+            self.get_azure_openai_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                max_retries=max_retries,
                organization=organization,
                api_version=api_version,
                client=client,
                _is_async=_is_async,
@ -252,14 +249,13 @@ class AzureOpenAIFilesAPI(BaseLLM):
        purpose: Optional[str] = None,
        api_version: Optional[str] = None,
        client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
        litellm_params: Optional[dict] = None,
    ):
        openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
-            get_azure_openai_client(
+            self.get_azure_openai_client(
                litellm_params=litellm_params or {},
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                max_retries=max_retries,
                organization=None,  # openai param
                api_version=api_version,
                client=client,
                _is_async=_is_async,
--- a/litellm/llms/azure/fine_tuning/handler.py
+++ b/litellm/llms/azure/fine_tuning/handler.py
@ -3,11 +3,11 @@ from typing import Optional, Union
 import httpx
 from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
-from litellm.llms.azure.files.handler import get_azure_openai_client
+from litellm.llms.azure.common_utils import BaseAzureLLM
 from litellm.llms.openai.fine_tuning.handler import OpenAIFineTuningAPI
-class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI):
+class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI, BaseAzureLLM):
    """
    AzureOpenAI methods to support fine tuning, inherits from OpenAIFineTuningAPI.
    """
@ -24,6 +24,7 @@ class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI):
        ] = None,
        _is_async: bool = False,
        api_version: Optional[str] = None,
        litellm_params: Optional[dict] = None,
    ) -> Optional[
        Union[
            OpenAI,
@ -36,12 +37,10 @@ class AzureOpenAIFineTuningAPI(OpenAIFineTuningAPI):
        if isinstance(client, OpenAI) or isinstance(client, AsyncOpenAI):
            client = None
-        return get_azure_openai_client(
+        return self.get_azure_openai_client(
            litellm_params=litellm_params or {},
            api_key=api_key,
            api_base=api_base,
            timeout=timeout,
            max_retries=max_retries,
            organization=organization,
            api_version=api_version,
            client=client,
            _is_async=_is_async,
--- a/litellm/llms/base_llm/responses/transformation.py
+++ b/litellm/llms/base_llm/responses/transformation.py
@ -0,0 +1,133 @@
 import types
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 import httpx
 from litellm.types.llms.openai import (
    ResponseInputParam,
    ResponsesAPIOptionalRequestParams,
    ResponsesAPIRequestParams,
    ResponsesAPIResponse,
    ResponsesAPIStreamingResponse,
 )
 from litellm.types.router import GenericLiteLLMParams
 if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
    from ..chat.transformation import BaseLLMException as _BaseLLMException
    LiteLLMLoggingObj = _LiteLLMLoggingObj
    BaseLLMException = _BaseLLMException
 else:
    LiteLLMLoggingObj = Any
    BaseLLMException = Any
 class BaseResponsesAPIConfig(ABC):
    def __init__(self):
        pass
    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not k.startswith("_abc")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }
    @abstractmethod
    def get_supported_openai_params(self, model: str) -> list:
        pass
    @abstractmethod
    def map_openai_params(
        self,
        response_api_optional_params: ResponsesAPIOptionalRequestParams,
        model: str,
        drop_params: bool,
    ) -> Dict:
        pass
    @abstractmethod
    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
    ) -> dict:
        return {}
    @abstractmethod
    def get_complete_url(
        self,
        api_base: Optional[str],
        model: str,
        stream: Optional[bool] = None,
    ) -> str:
        """
        OPTIONAL
        Get the complete url for the request
        Some providers need `model` in `api_base`
        """
        if api_base is None:
            raise ValueError("api_base is required")
        return api_base
    @abstractmethod
    def transform_responses_api_request(
        self,
        model: str,
        input: Union[str, ResponseInputParam],
        response_api_optional_request_params: Dict,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
    ) -> ResponsesAPIRequestParams:
        pass
    @abstractmethod
    def transform_response_api_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> ResponsesAPIResponse:
        pass
    @abstractmethod
    def transform_streaming_response(
        self,
        model: str,
        parsed_chunk: dict,
        logging_obj: LiteLLMLoggingObj,
    ) -> ResponsesAPIStreamingResponse:
        """
        Transform a parsed streaming response chunk into a ResponsesAPIStreamingResponse
        """
        pass
    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
    ) -> BaseLLMException:
        from ..chat.transformation import BaseLLMException
        raise BaseLLMException(
            status_code=status_code,
            message=error_message,
            headers=headers,
        )
--- a/litellm/llms/custom_httpx/llm_http_handler.py
+++ b/litellm/llms/custom_httpx/llm_http_handler.py
@ -1,6 +1,6 @@
 import io
 import json
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Tuple, Union
 import httpx  # type: ignore
@ -11,13 +11,21 @@ import litellm.types.utils
 from litellm.llms.base_llm.chat.transformation import BaseConfig
 from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
 from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
    _get_httpx_client,
    get_async_httpx_client,
 )
 from litellm.responses.streaming_iterator import (
    BaseResponsesAPIStreamingIterator,
    ResponsesAPIStreamingIterator,
    SyncResponsesAPIStreamingIterator,
 )
 from litellm.types.llms.openai import ResponseInputParam, ResponsesAPIResponse
 from litellm.types.rerank import OptionalRerankParams, RerankResponse
 from litellm.types.router import GenericLiteLLMParams
 from litellm.types.utils import EmbeddingResponse, FileTypes, TranscriptionResponse
 from litellm.utils import CustomStreamWrapper, ModelResponse, ProviderConfigManager
@ -956,8 +964,235 @@ class BaseLLMHTTPHandler:
            return returned_response
        return model_response
    def response_api_handler(
        self,
        model: str,
        input: Union[str, ResponseInputParam],
        responses_api_provider_config: BaseResponsesAPIConfig,
        response_api_optional_request_params: Dict,
        custom_llm_provider: str,
        litellm_params: GenericLiteLLMParams,
        logging_obj: LiteLLMLoggingObj,
        extra_headers: Optional[Dict[str, Any]] = None,
        extra_body: Optional[Dict[str, Any]] = None,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
        _is_async: bool = False,
    ) -> Union[
        ResponsesAPIResponse,
        BaseResponsesAPIStreamingIterator,
        Coroutine[
            Any, Any, Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator]
        ],
    ]:
        """
        Handles responses API requests.
        When _is_async=True, returns a coroutine instead of making the call directly.
        """
        if _is_async:
            # Return the async coroutine if called with _is_async=True
            return self.async_response_api_handler(
                model=model,
                input=input,
                responses_api_provider_config=responses_api_provider_config,
                response_api_optional_request_params=response_api_optional_request_params,
                custom_llm_provider=custom_llm_provider,
                litellm_params=litellm_params,
                logging_obj=logging_obj,
                extra_headers=extra_headers,
                extra_body=extra_body,
                timeout=timeout,
                client=client if isinstance(client, AsyncHTTPHandler) else None,
            )
        if client is None or not isinstance(client, HTTPHandler):
            sync_httpx_client = _get_httpx_client(
                params={"ssl_verify": litellm_params.get("ssl_verify", None)}
            )
        else:
            sync_httpx_client = client
        headers = responses_api_provider_config.validate_environment(
            api_key=litellm_params.api_key,
            headers=response_api_optional_request_params.get("extra_headers", {}) or {},
            model=model,
        )
        if extra_headers:
            headers.update(extra_headers)
        api_base = responses_api_provider_config.get_complete_url(
            api_base=litellm_params.api_base,
            model=model,
        )
        data = responses_api_provider_config.transform_responses_api_request(
            model=model,
            input=input,
            response_api_optional_request_params=response_api_optional_request_params,
            litellm_params=litellm_params,
            headers=headers,
        )
        ## LOGGING
        logging_obj.pre_call(
            input=input,
            api_key="",
            additional_args={
                "complete_input_dict": data,
                "api_base": api_base,
                "headers": headers,
            },
        )
        # Check if streaming is requested
        stream = response_api_optional_request_params.get("stream", False)
        try:
            if stream:
                # For streaming, use stream=True in the request
                response = sync_httpx_client.post(
                    url=api_base,
                    headers=headers,
                    data=json.dumps(data),
                    timeout=timeout
                    or response_api_optional_request_params.get("timeout"),
                    stream=True,
                )
                return SyncResponsesAPIStreamingIterator(
                    response=response,
                    model=model,
                    logging_obj=logging_obj,
                    responses_api_provider_config=responses_api_provider_config,
                )
            else:
                # For non-streaming requests
                response = sync_httpx_client.post(
                    url=api_base,
                    headers=headers,
                    data=json.dumps(data),
                    timeout=timeout
                    or response_api_optional_request_params.get("timeout"),
                )
        except Exception as e:
            raise self._handle_error(
                e=e,
                provider_config=responses_api_provider_config,
            )
        return responses_api_provider_config.transform_response_api_response(
            model=model,
            raw_response=response,
            logging_obj=logging_obj,
        )
    async def async_response_api_handler(
        self,
        model: str,
        input: Union[str, ResponseInputParam],
        responses_api_provider_config: BaseResponsesAPIConfig,
        response_api_optional_request_params: Dict,
        custom_llm_provider: str,
        litellm_params: GenericLiteLLMParams,
        logging_obj: LiteLLMLoggingObj,
        extra_headers: Optional[Dict[str, Any]] = None,
        extra_body: Optional[Dict[str, Any]] = None,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    ) -> Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator]:
        """
        Async version of the responses API handler.
        Uses async HTTP client to make requests.
        """
        if client is None or not isinstance(client, AsyncHTTPHandler):
            async_httpx_client = get_async_httpx_client(
                llm_provider=litellm.LlmProviders(custom_llm_provider),
                params={"ssl_verify": litellm_params.get("ssl_verify", None)},
            )
        else:
            async_httpx_client = client
        headers = responses_api_provider_config.validate_environment(
            api_key=litellm_params.api_key,
            headers=response_api_optional_request_params.get("extra_headers", {}) or {},
            model=model,
        )
        if extra_headers:
            headers.update(extra_headers)
        api_base = responses_api_provider_config.get_complete_url(
            api_base=litellm_params.api_base,
            model=model,
        )
        data = responses_api_provider_config.transform_responses_api_request(
            model=model,
            input=input,
            response_api_optional_request_params=response_api_optional_request_params,
            litellm_params=litellm_params,
            headers=headers,
        )
        ## LOGGING
        logging_obj.pre_call(
            input=input,
            api_key="",
            additional_args={
                "complete_input_dict": data,
                "api_base": api_base,
                "headers": headers,
            },
        )
        # Check if streaming is requested
        stream = response_api_optional_request_params.get("stream", False)
        try:
            if stream:
                # For streaming, we need to use stream=True in the request
                response = await async_httpx_client.post(
                    url=api_base,
                    headers=headers,
                    data=json.dumps(data),
                    timeout=timeout
                    or response_api_optional_request_params.get("timeout"),
                    stream=True,
                )
                # Return the streaming iterator
                return ResponsesAPIStreamingIterator(
                    response=response,
                    model=model,
                    logging_obj=logging_obj,
                    responses_api_provider_config=responses_api_provider_config,
                )
            else:
                # For non-streaming, proceed as before
                response = await async_httpx_client.post(
                    url=api_base,
                    headers=headers,
                    data=json.dumps(data),
                    timeout=timeout
                    or response_api_optional_request_params.get("timeout"),
                )
        except Exception as e:
            raise self._handle_error(
                e=e,
                provider_config=responses_api_provider_config,
            )
        return responses_api_provider_config.transform_response_api_response(
            model=model,
            raw_response=response,
            logging_obj=logging_obj,
        )
    def _handle_error(
-        self, e: Exception, provider_config: Union[BaseConfig, BaseRerankConfig]
+        self,
        e: Exception,
        provider_config: Union[BaseConfig, BaseRerankConfig, BaseResponsesAPIConfig],
    ):
        status_code = getattr(e, "status_code", 500)
        error_headers = getattr(e, "headers", None)
--- a/litellm/llms/openai/fine_tuning/handler.py
+++ b/litellm/llms/openai/fine_tuning/handler.py
@ -27,6 +27,7 @@ class OpenAIFineTuningAPI:
        ] = None,
        _is_async: bool = False,
        api_version: Optional[str] = None,
        litellm_params: Optional[dict] = None,
    ) -> Optional[
        Union[
            OpenAI,
--- a/litellm/llms/openai/openai.py
+++ b/litellm/llms/openai/openai.py
@ -2650,7 +2650,7 @@ class OpenAIAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -2689,12 +2689,12 @@ class OpenAIAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        tools: Optional[Iterable[AssistantToolParam]],
        event_handler: Optional[AssistantEventHandler],
    ) -> AsyncAssistantStreamManager[AsyncAssistantEventHandler]:
-        data = {
+        data: Dict[str, Any] = {
            "thread_id": thread_id,
            "assistant_id": assistant_id,
            "additional_instructions": additional_instructions,
@ -2714,12 +2714,12 @@ class OpenAIAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        tools: Optional[Iterable[AssistantToolParam]],
        event_handler: Optional[AssistantEventHandler],
    ) -> AssistantStreamManager[AssistantEventHandler]:
-        data = {
+        data: Dict[str, Any] = {
            "thread_id": thread_id,
            "assistant_id": assistant_id,
            "additional_instructions": additional_instructions,
@ -2741,7 +2741,7 @@ class OpenAIAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -2763,7 +2763,7 @@ class OpenAIAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
@ -2786,7 +2786,7 @@ class OpenAIAssistantsAPI(BaseLLM):
        assistant_id: str,
        additional_instructions: Optional[str],
        instructions: Optional[str],
-        metadata: Optional[object],
+        metadata: Optional[Dict],
        model: Optional[str],
        stream: Optional[bool],
        tools: Optional[Iterable[AssistantToolParam]],
--- a/litellm/llms/openai/responses/transformation.py
+++ b/litellm/llms/openai/responses/transformation.py
@ -0,0 +1,190 @@
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union, cast
 import httpx
 import litellm
 from litellm._logging import verbose_logger
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.llms.openai import *
 from litellm.types.router import GenericLiteLLMParams
 from ..common_utils import OpenAIError
 if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
    LiteLLMLoggingObj = _LiteLLMLoggingObj
 else:
    LiteLLMLoggingObj = Any
 class OpenAIResponsesAPIConfig(BaseResponsesAPIConfig):
    def get_supported_openai_params(self, model: str) -> list:
        """
        All OpenAI Responses API params are supported
        """
        return [
            "input",
            "model",
            "include",
            "instructions",
            "max_output_tokens",
            "metadata",
            "parallel_tool_calls",
            "previous_response_id",
            "reasoning",
            "store",
            "stream",
            "temperature",
            "text",
            "tool_choice",
            "tools",
            "top_p",
            "truncation",
            "user",
            "extra_headers",
            "extra_query",
            "extra_body",
            "timeout",
        ]
    def map_openai_params(
        self,
        response_api_optional_params: ResponsesAPIOptionalRequestParams,
        model: str,
        drop_params: bool,
    ) -> Dict:
        """No mapping applied since inputs are in OpenAI spec already"""
        return dict(response_api_optional_params)
    def transform_responses_api_request(
        self,
        model: str,
        input: Union[str, ResponseInputParam],
        response_api_optional_request_params: Dict,
        litellm_params: GenericLiteLLMParams,
        headers: dict,
    ) -> ResponsesAPIRequestParams:
        """No transform applied since inputs are in OpenAI spec already"""
        return ResponsesAPIRequestParams(
            model=model, input=input, **response_api_optional_request_params
        )
    def transform_response_api_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> ResponsesAPIResponse:
        """No transform applied since outputs are in OpenAI spec already"""
        try:
            raw_response_json = raw_response.json()
        except Exception:
            raise OpenAIError(
                message=raw_response.text, status_code=raw_response.status_code
            )
        return ResponsesAPIResponse(**raw_response_json)
    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
    ) -> dict:
        api_key = (
            api_key
            or litellm.api_key
            or litellm.openai_key
            or get_secret_str("OPENAI_API_KEY")
        )
        headers.update(
            {
                "Authorization": f"Bearer {api_key}",
            }
        )
        return headers
    def get_complete_url(
        self,
        api_base: Optional[str],
        model: str,
        stream: Optional[bool] = None,
    ) -> str:
        """
        Get the endpoint for OpenAI responses API
        """
        api_base = (
            api_base
            or litellm.api_base
            or get_secret_str("OPENAI_API_BASE")
            or "https://api.openai.com/v1"
        )
        # Remove trailing slashes
        api_base = api_base.rstrip("/")
        return f"{api_base}/responses"
    def transform_streaming_response(
        self,
        model: str,
        parsed_chunk: dict,
        logging_obj: LiteLLMLoggingObj,
    ) -> ResponsesAPIStreamingResponse:
        """
        Transform a parsed streaming response chunk into a ResponsesAPIStreamingResponse
        """
        # Convert the dictionary to a properly typed ResponsesAPIStreamingResponse
        verbose_logger.debug("Raw OpenAI Chunk=%s", parsed_chunk)
        event_type = str(parsed_chunk.get("type"))
        event_pydantic_model = OpenAIResponsesAPIConfig.get_event_model_class(
            event_type=event_type
        )
        return event_pydantic_model(**parsed_chunk)
    @staticmethod
    def get_event_model_class(event_type: str) -> Any:
        """
        Returns the appropriate event model class based on the event type.
        Args:
            event_type (str): The type of event from the response chunk
        Returns:
            Any: The corresponding event model class
        Raises:
            ValueError: If the event type is unknown
        """
        event_models = {
            ResponsesAPIStreamEvents.RESPONSE_CREATED: ResponseCreatedEvent,
            ResponsesAPIStreamEvents.RESPONSE_IN_PROGRESS: ResponseInProgressEvent,
            ResponsesAPIStreamEvents.RESPONSE_COMPLETED: ResponseCompletedEvent,
            ResponsesAPIStreamEvents.RESPONSE_FAILED: ResponseFailedEvent,
            ResponsesAPIStreamEvents.RESPONSE_INCOMPLETE: ResponseIncompleteEvent,
            ResponsesAPIStreamEvents.OUTPUT_ITEM_ADDED: OutputItemAddedEvent,
            ResponsesAPIStreamEvents.OUTPUT_ITEM_DONE: OutputItemDoneEvent,
            ResponsesAPIStreamEvents.CONTENT_PART_ADDED: ContentPartAddedEvent,
            ResponsesAPIStreamEvents.CONTENT_PART_DONE: ContentPartDoneEvent,
            ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA: OutputTextDeltaEvent,
            ResponsesAPIStreamEvents.OUTPUT_TEXT_ANNOTATION_ADDED: OutputTextAnnotationAddedEvent,
            ResponsesAPIStreamEvents.OUTPUT_TEXT_DONE: OutputTextDoneEvent,
            ResponsesAPIStreamEvents.REFUSAL_DELTA: RefusalDeltaEvent,
            ResponsesAPIStreamEvents.REFUSAL_DONE: RefusalDoneEvent,
            ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DELTA: FunctionCallArgumentsDeltaEvent,
            ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DONE: FunctionCallArgumentsDoneEvent,
            ResponsesAPIStreamEvents.FILE_SEARCH_CALL_IN_PROGRESS: FileSearchCallInProgressEvent,
            ResponsesAPIStreamEvents.FILE_SEARCH_CALL_SEARCHING: FileSearchCallSearchingEvent,
            ResponsesAPIStreamEvents.FILE_SEARCH_CALL_COMPLETED: FileSearchCallCompletedEvent,
            ResponsesAPIStreamEvents.WEB_SEARCH_CALL_IN_PROGRESS: WebSearchCallInProgressEvent,
            ResponsesAPIStreamEvents.WEB_SEARCH_CALL_SEARCHING: WebSearchCallSearchingEvent,
            ResponsesAPIStreamEvents.WEB_SEARCH_CALL_COMPLETED: WebSearchCallCompletedEvent,
            ResponsesAPIStreamEvents.ERROR: ErrorEvent,
        }
        model_class = event_models.get(cast(ResponsesAPIStreamEvents, event_type))
        if not model_class:
            raise ValueError(f"Unknown event type: {event_type}")
        return model_class
--- a/litellm/main.py
+++ b/litellm/main.py
@ -1163,6 +1163,14 @@ def completion(  # type: ignore # noqa: PLR0915
                "merge_reasoning_content_in_choices", None
            ),
            api_version=api_version,
            azure_ad_token=kwargs.get("azure_ad_token"),
            tenant_id=kwargs.get("tenant_id"),
            client_id=kwargs.get("client_id"),
            client_secret=kwargs.get("client_secret"),
            azure_username=kwargs.get("azure_username"),
            azure_password=kwargs.get("azure_password"),
            max_retries=max_retries,
            timeout=timeout,
        )
        logging.update_environment_variables(
            model=model,
@ -3351,6 +3359,7 @@ def embedding(  # noqa: PLR0915
                }
            }
        )
    litellm_params_dict = get_litellm_params(**kwargs)
    logging: Logging = litellm_logging_obj  # type: ignore
@ -3412,6 +3421,7 @@ def embedding(  # noqa: PLR0915
                aembedding=aembedding,
                max_retries=max_retries,
                headers=headers or extra_headers,
                litellm_params=litellm_params_dict,
            )
        elif (
            model in litellm.open_ai_embedding_models
@ -4515,6 +4525,8 @@ def image_generation(  # noqa: PLR0915
            **non_default_params,
        )
        litellm_params_dict = get_litellm_params(**kwargs)
        logging: Logging = litellm_logging_obj
        logging.update_environment_variables(
            model=model,
@ -4585,6 +4597,7 @@ def image_generation(  # noqa: PLR0915
                aimg_generation=aimg_generation,
                client=client,
                headers=headers,
                litellm_params=litellm_params_dict,
            )
        elif (
            custom_llm_provider == "openai"
@ -4980,6 +4993,7 @@ def transcription(
        custom_llm_provider=custom_llm_provider,
        drop_params=drop_params,
    )
    litellm_params_dict = get_litellm_params(**kwargs)
    litellm_logging_obj.update_environment_variables(
        model=model,
@ -5033,6 +5047,7 @@ def transcription(
            api_version=api_version,
            azure_ad_token=azure_ad_token,
            max_retries=max_retries,
            litellm_params=litellm_params_dict,
        )
    elif (
        custom_llm_provider == "openai"
@ -5135,7 +5150,7 @@ async def aspeech(*args, **kwargs) -> HttpxBinaryResponseContent:
@client
-def speech(
+def speech(  # noqa: PLR0915
    model: str,
    input: str,
    voice: Optional[Union[str, dict]] = None,
@ -5176,7 +5191,7 @@ def speech(
    if max_retries is None:
        max_retries = litellm.num_retries or openai.DEFAULT_MAX_RETRIES
-
+    litellm_params_dict = get_litellm_params(**kwargs)
    logging_obj = kwargs.get("litellm_logging_obj", None)
    logging_obj.update_environment_variables(
        model=model,
@ -5293,6 +5308,7 @@ def speech(
            timeout=timeout,
            client=client,  # pass AsyncOpenAI, OpenAI client
            aspeech=aspeech,
            litellm_params=litellm_params_dict,
        )
    elif custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai_beta":
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@ -3,15 +3,15 @@ model_list:
    litellm_params:
      model: bedrock/amazon.nova-canvas-v1:0
      aws_region_name: "us-east-1"
-  - model_name: gpt-4o-mini-3
+      litellm_credential_name: "azure"
-    litellm_params:
+
-      model: azure/gpt-4o-mini-3
+credential_list:
-      api_key: os.environ/AZURE_API_KEY
+  - credential_name: azure
-      api_base: os.environ/AZURE_API_BASE
+    credential_values:
    model_info:
      base_model: azure/eu.gpt-4o-mini-2
  - model_name: gpt-4o-mini-2
    litellm_params:
      model: azure/gpt-4o-mini-2
      api_key: os.environ/AZURE_API_KEY
      api_base: os.environ/AZURE_API_BASE
    credential_info:
      description: "Azure API Key and Base URL"
      type: "azure"
      required: true
      default: "azure"
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -2299,6 +2299,7 @@ class SpecialHeaders(enum.Enum):
    azure_authorization = "API-Key"
    anthropic_authorization = "x-api-key"
    google_ai_studio_authorization = "x-goog-api-key"
    azure_apim_authorization = "Ocp-Apim-Subscription-Key"
 class LitellmDataForBackendLLMCall(TypedDict, total=False):
--- a/litellm/proxy/anthropic_endpoints/endpoints.py
+++ b/litellm/proxy/anthropic_endpoints/endpoints.py
@ -14,6 +14,7 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
 from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
 from litellm.proxy.utils import ProxyLogging
@ -89,7 +90,6 @@ async def anthropic_response(  # noqa: PLR0915
    """
    from litellm.proxy.proxy_server import (
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
@ -205,7 +205,7 @@ async def anthropic_response(  # noqa: PLR0915
        verbose_proxy_logger.debug("final response: %s", response)
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
--- a/litellm/proxy/auth/user_api_key_auth.py
+++ b/litellm/proxy/auth/user_api_key_auth.py
@ -77,6 +77,11 @@ google_ai_studio_api_key_header = APIKeyHeader(
    auto_error=False,
    description="If google ai studio client used.",
 )
 azure_apim_header = APIKeyHeader(
    name=SpecialHeaders.azure_apim_authorization.value,
    auto_error=False,
    description="The default name of the subscription key header of Azure",
 )
 def _get_bearer_token(
@ -301,6 +306,7 @@ async def _user_api_key_auth_builder(  # noqa: PLR0915
    azure_api_key_header: str,
    anthropic_api_key_header: Optional[str],
    google_ai_studio_api_key_header: Optional[str],
    azure_apim_header: Optional[str],
    request_data: dict,
 ) -> UserAPIKeyAuth:
@ -344,6 +350,8 @@ async def _user_api_key_auth_builder(  # noqa: PLR0915
            api_key = anthropic_api_key_header
        elif isinstance(google_ai_studio_api_key_header, str):
            api_key = google_ai_studio_api_key_header
        elif isinstance(azure_apim_header, str):
            api_key = azure_apim_header
        elif pass_through_endpoints is not None:
            for endpoint in pass_through_endpoints:
                if endpoint.get("path", "") == route:
@ -1165,6 +1173,7 @@ async def user_api_key_auth(
    google_ai_studio_api_key_header: Optional[str] = fastapi.Security(
        google_ai_studio_api_key_header
    ),
    azure_apim_header: Optional[str] = fastapi.Security(azure_apim_header),
 ) -> UserAPIKeyAuth:
    """
    Parent function to authenticate user api key / jwt token.
@ -1178,6 +1187,7 @@ async def user_api_key_auth(
        azure_api_key_header=azure_api_key_header,
        anthropic_api_key_header=anthropic_api_key_header,
        google_ai_studio_api_key_header=google_ai_studio_api_key_header,
        azure_apim_header=azure_apim_header,
        request_data=request_data,
    )
--- a/litellm/proxy/batches_endpoints/endpoints.py
+++ b/litellm/proxy/batches_endpoints/endpoints.py
@ -18,6 +18,7 @@ from litellm.batches.main import (
 )
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
 from litellm.proxy.common_utils.openai_endpoint_utils import (
    get_custom_llm_provider_from_request_body,
@ -69,7 +70,6 @@ async def create_batch(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
@ -137,7 +137,7 @@ async def create_batch(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -201,7 +201,6 @@ async def retrieve_batch(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
@ -266,7 +265,7 @@ async def retrieve_batch(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -326,11 +325,7 @@ async def list_batches(
    ```
    """
-    from litellm.proxy.proxy_server import (
+    from litellm.proxy.proxy_server import proxy_logging_obj, version
        get_custom_headers,
        proxy_logging_obj,
        version,
    )
    verbose_proxy_logger.debug("GET /v1/batches after={} limit={}".format(after, limit))
    try:
@ -352,7 +347,7 @@ async def list_batches(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -417,7 +412,6 @@ async def cancel_batch(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        proxy_config,
        proxy_logging_obj,
        version,
@ -463,7 +457,7 @@ async def cancel_batch(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
--- a/litellm/proxy/common_request_processing.py
+++ b/litellm/proxy/common_request_processing.py
@ -0,0 +1,356 @@
 import asyncio
 import json
 import uuid
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 import httpx
 from fastapi import HTTPException, Request, status
 from fastapi.responses import Response, StreamingResponse
 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.proxy._types import ProxyException, UserAPIKeyAuth
 from litellm.proxy.auth.auth_utils import check_response_size_is_safe
 from litellm.proxy.common_utils.callback_utils import (
    get_logging_caching_headers,
    get_remaining_tokens_and_requests_from_request_data,
 )
 from litellm.proxy.route_llm_request import route_request
 from litellm.proxy.utils import ProxyLogging
 from litellm.router import Router
 if TYPE_CHECKING:
    from litellm.proxy.proxy_server import ProxyConfig as _ProxyConfig
    ProxyConfig = _ProxyConfig
 else:
    ProxyConfig = Any
 from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
 class ProxyBaseLLMRequestProcessing:
    def __init__(self, data: dict):
        self.data = data
    @staticmethod
    def get_custom_headers(
        *,
        user_api_key_dict: UserAPIKeyAuth,
        call_id: Optional[str] = None,
        model_id: Optional[str] = None,
        cache_key: Optional[str] = None,
        api_base: Optional[str] = None,
        version: Optional[str] = None,
        model_region: Optional[str] = None,
        response_cost: Optional[Union[float, str]] = None,
        hidden_params: Optional[dict] = None,
        fastest_response_batch_completion: Optional[bool] = None,
        request_data: Optional[dict] = {},
        timeout: Optional[Union[float, int, httpx.Timeout]] = None,
        **kwargs,
    ) -> dict:
        exclude_values = {"", None, "None"}
        hidden_params = hidden_params or {}
        headers = {
            "x-litellm-call-id": call_id,
            "x-litellm-model-id": model_id,
            "x-litellm-cache-key": cache_key,
            "x-litellm-model-api-base": api_base,
            "x-litellm-version": version,
            "x-litellm-model-region": model_region,
            "x-litellm-response-cost": str(response_cost),
            "x-litellm-key-tpm-limit": str(user_api_key_dict.tpm_limit),
            "x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit),
            "x-litellm-key-max-budget": str(user_api_key_dict.max_budget),
            "x-litellm-key-spend": str(user_api_key_dict.spend),
            "x-litellm-response-duration-ms": str(
                hidden_params.get("_response_ms", None)
            ),
            "x-litellm-overhead-duration-ms": str(
                hidden_params.get("litellm_overhead_time_ms", None)
            ),
            "x-litellm-fastest_response_batch_completion": (
                str(fastest_response_batch_completion)
                if fastest_response_batch_completion is not None
                else None
            ),
            "x-litellm-timeout": str(timeout) if timeout is not None else None,
            **{k: str(v) for k, v in kwargs.items()},
        }
        if request_data:
            remaining_tokens_header = (
                get_remaining_tokens_and_requests_from_request_data(request_data)
            )
            headers.update(remaining_tokens_header)
            logging_caching_headers = get_logging_caching_headers(request_data)
            if logging_caching_headers:
                headers.update(logging_caching_headers)
        try:
            return {
                key: str(value)
                for key, value in headers.items()
                if value not in exclude_values
            }
        except Exception as e:
            verbose_proxy_logger.error(f"Error setting custom headers: {e}")
            return {}
    async def base_process_llm_request(
        self,
        request: Request,
        fastapi_response: Response,
        user_api_key_dict: UserAPIKeyAuth,
        route_type: Literal["acompletion", "aresponses"],
        proxy_logging_obj: ProxyLogging,
        general_settings: dict,
        proxy_config: ProxyConfig,
        select_data_generator: Callable,
        llm_router: Optional[Router] = None,
        model: Optional[str] = None,
        user_model: Optional[str] = None,
        user_temperature: Optional[float] = None,
        user_request_timeout: Optional[float] = None,
        user_max_tokens: Optional[int] = None,
        user_api_base: Optional[str] = None,
        version: Optional[str] = None,
    ) -> Any:
        """
        Common request processing logic for both chat completions and responses API endpoints
        """
        verbose_proxy_logger.debug(
            "Request received by LiteLLM:\n{}".format(json.dumps(self.data, indent=4)),
        )
        self.data = await add_litellm_data_to_request(
            data=self.data,
            request=request,
            general_settings=general_settings,
            user_api_key_dict=user_api_key_dict,
            version=version,
            proxy_config=proxy_config,
        )
        self.data["model"] = (
            general_settings.get("completion_model", None)  # server default
            or user_model  # model name passed via cli args
            or model  # for azure deployments
            or self.data.get("model", None)  # default passed in http request
        )
        # override with user settings, these are params passed via cli
        if user_temperature:
            self.data["temperature"] = user_temperature
        if user_request_timeout:
            self.data["request_timeout"] = user_request_timeout
        if user_max_tokens:
            self.data["max_tokens"] = user_max_tokens
        if user_api_base:
            self.data["api_base"] = user_api_base
        ### MODEL ALIAS MAPPING ###
        # check if model name in model alias map
        # get the actual model name
        if (
            isinstance(self.data["model"], str)
            and self.data["model"] in litellm.model_alias_map
        ):
            self.data["model"] = litellm.model_alias_map[self.data["model"]]
        ### CALL HOOKS ### - modify/reject incoming data before calling the model
        self.data = await proxy_logging_obj.pre_call_hook(  # type: ignore
            user_api_key_dict=user_api_key_dict, data=self.data, call_type="completion"
        )
        ## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
        ## IMPORTANT Note: - initialize this before running pre-call checks. Ensures we log rejected requests to langfuse.
        self.data["litellm_call_id"] = request.headers.get(
            "x-litellm-call-id", str(uuid.uuid4())
        )
        logging_obj, self.data = litellm.utils.function_setup(
            original_function=route_type,
            rules_obj=litellm.utils.Rules(),
            start_time=datetime.now(),
            **self.data,
        )
        self.data["litellm_logging_obj"] = logging_obj
        tasks = []
        tasks.append(
            proxy_logging_obj.during_call_hook(
                data=self.data,
                user_api_key_dict=user_api_key_dict,
                call_type=ProxyBaseLLMRequestProcessing._get_pre_call_type(
                    route_type=route_type
                ),
            )
        )
        ### ROUTE THE REQUEST ###
        # Do not change this - it should be a constant time fetch - ALWAYS
        llm_call = await route_request(
            data=self.data,
            route_type=route_type,
            llm_router=llm_router,
            user_model=user_model,
        )
        tasks.append(llm_call)
        # wait for call to end
        llm_responses = asyncio.gather(
            *tasks
        )  # run the moderation check in parallel to the actual llm api call
        responses = await llm_responses
        response = responses[1]
        hidden_params = getattr(response, "_hidden_params", {}) or {}
        model_id = hidden_params.get("model_id", None) or ""
        cache_key = hidden_params.get("cache_key", None) or ""
        api_base = hidden_params.get("api_base", None) or ""
        response_cost = hidden_params.get("response_cost", None) or ""
        fastest_response_batch_completion = hidden_params.get(
            "fastest_response_batch_completion", None
        )
        additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
        # Post Call Processing
        if llm_router is not None:
            self.data["deployment"] = llm_router.get_deployment(model_id=model_id)
        asyncio.create_task(
            proxy_logging_obj.update_request_status(
                litellm_call_id=self.data.get("litellm_call_id", ""), status="success"
            )
        )
        if (
            "stream" in self.data and self.data["stream"] is True
        ):  # use generate_responses to stream responses
            custom_headers = ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                call_id=logging_obj.litellm_call_id,
                model_id=model_id,
                cache_key=cache_key,
                api_base=api_base,
                version=version,
                response_cost=response_cost,
                model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
                fastest_response_batch_completion=fastest_response_batch_completion,
                request_data=self.data,
                hidden_params=hidden_params,
                **additional_headers,
            )
            selected_data_generator = select_data_generator(
                response=response,
                user_api_key_dict=user_api_key_dict,
                request_data=self.data,
            )
            return StreamingResponse(
                selected_data_generator,
                media_type="text/event-stream",
                headers=custom_headers,
            )
        ### CALL HOOKS ### - modify outgoing data
        response = await proxy_logging_obj.post_call_success_hook(
            data=self.data, user_api_key_dict=user_api_key_dict, response=response
        )
        hidden_params = (
            getattr(response, "_hidden_params", {}) or {}
        )  # get any updated response headers
        additional_headers = hidden_params.get("additional_headers", {}) or {}
        fastapi_response.headers.update(
            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                call_id=logging_obj.litellm_call_id,
                model_id=model_id,
                cache_key=cache_key,
                api_base=api_base,
                version=version,
                response_cost=response_cost,
                model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
                fastest_response_batch_completion=fastest_response_batch_completion,
                request_data=self.data,
                hidden_params=hidden_params,
                **additional_headers,
            )
        )
        await check_response_size_is_safe(response=response)
        return response
    async def _handle_llm_api_exception(
        self,
        e: Exception,
        user_api_key_dict: UserAPIKeyAuth,
        proxy_logging_obj: ProxyLogging,
        version: Optional[str] = None,
    ):
        """Raises ProxyException (OpenAI API compatible) if an exception is raised"""
        verbose_proxy_logger.exception(
            f"litellm.proxy.proxy_server._handle_llm_api_exception(): Exception occured - {str(e)}"
        )
        await proxy_logging_obj.post_call_failure_hook(
            user_api_key_dict=user_api_key_dict,
            original_exception=e,
            request_data=self.data,
        )
        litellm_debug_info = getattr(e, "litellm_debug_info", "")
        verbose_proxy_logger.debug(
            "\033[1;31mAn error occurred: %s %s\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`",
            e,
            litellm_debug_info,
        )
        timeout = getattr(
            e, "timeout", None
        )  # returns the timeout set by the wrapper. Used for testing if model-specific timeout are set correctly
        _litellm_logging_obj: Optional[LiteLLMLoggingObj] = self.data.get(
            "litellm_logging_obj", None
        )
        custom_headers = ProxyBaseLLMRequestProcessing.get_custom_headers(
            user_api_key_dict=user_api_key_dict,
            call_id=(
                _litellm_logging_obj.litellm_call_id if _litellm_logging_obj else None
            ),
            version=version,
            response_cost=0,
            model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
            request_data=self.data,
            timeout=timeout,
        )
        headers = getattr(e, "headers", {}) or {}
        headers.update(custom_headers)
        if isinstance(e, HTTPException):
            raise ProxyException(
                message=getattr(e, "detail", str(e)),
                type=getattr(e, "type", "None"),
                param=getattr(e, "param", "None"),
                code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
                headers=headers,
            )
        error_msg = f"{str(e)}"
        raise ProxyException(
            message=getattr(e, "message", error_msg),
            type=getattr(e, "type", "None"),
            param=getattr(e, "param", "None"),
            openai_code=getattr(e, "code", None),
            code=getattr(e, "status_code", 500),
            headers=headers,
        )
    @staticmethod
    def _get_pre_call_type(
        route_type: Literal["acompletion", "aresponses"]
    ) -> Literal["completion", "responses"]:
        if route_type == "acompletion":
            return "completion"
        elif route_type == "aresponses":
            return "responses"
--- a/litellm/proxy/credential_endpoints/endpoints.py
+++ b/litellm/proxy/credential_endpoints/endpoints.py
@ -0,0 +1,200 @@
 """
 CRUD endpoints for storing reusable credentials.
 """
 from fastapi import APIRouter, Depends, HTTPException, Request, Response
 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.litellm_core_utils.credential_accessor import CredentialAccessor
 from litellm.proxy._types import CommonProxyErrors, UserAPIKeyAuth
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_utils.encrypt_decrypt_utils import encrypt_value_helper
 from litellm.proxy.utils import handle_exception_on_proxy, jsonify_object
 from litellm.types.utils import CredentialItem
 router = APIRouter()
 class CredentialHelperUtils:
    @staticmethod
    def encrypt_credential_values(credential: CredentialItem) -> CredentialItem:
        """Encrypt values in credential.credential_values and add to DB"""
        encrypted_credential_values = {}
        for key, value in credential.credential_values.items():
            encrypted_credential_values[key] = encrypt_value_helper(value)
        credential.credential_values = encrypted_credential_values
        return credential
@router.post(
    "/credentials",
    dependencies=[Depends(user_api_key_auth)],
    tags=["credential management"],
 )
 async def create_credential(
    request: Request,
    fastapi_response: Response,
    credential: CredentialItem,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
    [BETA] endpoint. This might change unexpectedly.
    Stores credential in DB.
    Reloads credentials in memory.
    """
    from litellm.proxy.proxy_server import prisma_client
    try:
        if prisma_client is None:
            raise HTTPException(
                status_code=500,
                detail={"error": CommonProxyErrors.db_not_connected_error.value},
            )
        encrypted_credential = CredentialHelperUtils.encrypt_credential_values(
            credential
        )
        credentials_dict = encrypted_credential.model_dump()
        credentials_dict_jsonified = jsonify_object(credentials_dict)
        await prisma_client.db.litellm_credentialstable.create(
            data={
                **credentials_dict_jsonified,
                "created_by": user_api_key_dict.user_id,
                "updated_by": user_api_key_dict.user_id,
            }
        )
        ## ADD TO LITELLM ##
        CredentialAccessor.upsert_credentials([credential])
        return {"success": True, "message": "Credential created successfully"}
    except Exception as e:
        verbose_proxy_logger.exception(e)
        raise handle_exception_on_proxy(e)
@router.get(
    "/credentials",
    dependencies=[Depends(user_api_key_auth)],
    tags=["credential management"],
 )
 async def get_credentials(
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
    [BETA] endpoint. This might change unexpectedly.
    """
    try:
        masked_credentials = [
            {
                "credential_name": credential.credential_name,
                "credential_info": credential.credential_info,
            }
            for credential in litellm.credential_list
        ]
        return {"success": True, "credentials": masked_credentials}
    except Exception as e:
        return handle_exception_on_proxy(e)
@router.get(
    "/credentials/{credential_name}",
    dependencies=[Depends(user_api_key_auth)],
    tags=["credential management"],
 )
 async def get_credential(
    request: Request,
    fastapi_response: Response,
    credential_name: str,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
    [BETA] endpoint. This might change unexpectedly.
    """
    try:
        for credential in litellm.credential_list:
            if credential.credential_name == credential_name:
                masked_credential = {
                    "credential_name": credential.credential_name,
                    "credential_values": credential.credential_values,
                }
                return {"success": True, "credential": masked_credential}
        return {"success": False, "message": "Credential not found"}
    except Exception as e:
        return handle_exception_on_proxy(e)
@router.delete(
    "/credentials/{credential_name}",
    dependencies=[Depends(user_api_key_auth)],
    tags=["credential management"],
 )
 async def delete_credential(
    request: Request,
    fastapi_response: Response,
    credential_name: str,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
    [BETA] endpoint. This might change unexpectedly.
    """
    from litellm.proxy.proxy_server import prisma_client
    try:
        if prisma_client is None:
            raise HTTPException(
                status_code=500,
                detail={"error": CommonProxyErrors.db_not_connected_error.value},
            )
        await prisma_client.db.litellm_credentialstable.delete(
            where={"credential_name": credential_name}
        )
        ## DELETE FROM LITELLM ##
        litellm.credential_list = [
            cred
            for cred in litellm.credential_list
            if cred.credential_name != credential_name
        ]
        return {"success": True, "message": "Credential deleted successfully"}
    except Exception as e:
        return handle_exception_on_proxy(e)
@router.put(
    "/credentials/{credential_name}",
    dependencies=[Depends(user_api_key_auth)],
    tags=["credential management"],
 )
 async def update_credential(
    request: Request,
    fastapi_response: Response,
    credential_name: str,
    credential: CredentialItem,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
    [BETA] endpoint. This might change unexpectedly.
    """
    from litellm.proxy.proxy_server import prisma_client
    try:
        if prisma_client is None:
            raise HTTPException(
                status_code=500,
                detail={"error": CommonProxyErrors.db_not_connected_error.value},
            )
        credential_object_jsonified = jsonify_object(credential.model_dump())
        await prisma_client.db.litellm_credentialstable.update(
            where={"credential_name": credential_name},
            data={
                **credential_object_jsonified,
                "updated_by": user_api_key_dict.user_id,
            },
        )
        return {"success": True, "message": "Credential updated successfully"}
    except Exception as e:
        return handle_exception_on_proxy(e)
--- a/litellm/proxy/example_config_yaml/custom_callbacks1.py
+++ b/litellm/proxy/example_config_yaml/custom_callbacks1.py
@ -61,6 +61,7 @@ class MyCustomHandler(
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        pass
--- a/litellm/proxy/example_config_yaml/custom_guardrail.py
+++ b/litellm/proxy/example_config_yaml/custom_guardrail.py
@ -66,6 +66,7 @@ class myCustomGuardrail(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """
--- a/litellm/proxy/fine_tuning_endpoints/endpoints.py
+++ b/litellm/proxy/fine_tuning_endpoints/endpoints.py
@ -15,6 +15,7 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 from litellm.proxy.utils import handle_exception_on_proxy
 router = APIRouter()
@ -97,7 +98,6 @@ async def create_fine_tuning_job(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        premium_user,
        proxy_config,
        proxy_logging_obj,
@ -151,7 +151,7 @@ async def create_fine_tuning_job(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -205,7 +205,6 @@ async def retrieve_fine_tuning_job(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        premium_user,
        proxy_config,
        proxy_logging_obj,
@ -248,7 +247,7 @@ async def retrieve_fine_tuning_job(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -305,7 +304,6 @@ async def list_fine_tuning_jobs(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        premium_user,
        proxy_config,
        proxy_logging_obj,
@ -349,7 +347,7 @@ async def list_fine_tuning_jobs(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -404,7 +402,6 @@ async def cancel_fine_tuning_job(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        premium_user,
        proxy_config,
        proxy_logging_obj,
@ -451,7 +448,7 @@ async def cancel_fine_tuning_job(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
--- a/litellm/proxy/guardrails/guardrail_hooks/aim.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/aim.py
@ -25,8 +25,12 @@ class AimGuardrailMissingSecrets(Exception):
 class AimGuardrail(CustomGuardrail):
-    def __init__(self, api_key: Optional[str] = None, api_base: Optional[str] = None, **kwargs):
+    def __init__(
-        self.async_handler = get_async_httpx_client(llm_provider=httpxSpecialProvider.GuardrailCallback)
+        self, api_key: Optional[str] = None, api_base: Optional[str] = None, **kwargs
    ):
        self.async_handler = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.GuardrailCallback
        )
        self.api_key = api_key or os.environ.get("AIM_API_KEY")
        if not self.api_key:
            msg = (
@ -34,7 +38,9 @@ class AimGuardrail(CustomGuardrail):
                "pass it as a parameter to the guardrail in the config file"
            )
            raise AimGuardrailMissingSecrets(msg)
-        self.api_base = api_base or os.environ.get("AIM_API_BASE") or "https://api.aim.security"
+        self.api_base = (
            api_base or os.environ.get("AIM_API_BASE") or "https://api.aim.security"
        )
        super().__init__(**kwargs)
    async def async_pre_call_hook(
@ -68,6 +74,7 @@ class AimGuardrail(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ) -> Union[Exception, str, dict, None]:
        verbose_proxy_logger.debug("Inside AIM Moderation Hook")
@ -77,9 +84,10 @@ class AimGuardrail(CustomGuardrail):
    async def call_aim_guardrail(self, data: dict, hook: str) -> None:
        user_email = data.get("metadata", {}).get("headers", {}).get("x-aim-user-email")
-        headers = {"Authorization": f"Bearer {self.api_key}", "x-aim-litellm-hook": hook} | (
+        headers = {
-            {"x-aim-user-email": user_email} if user_email else {}
+            "Authorization": f"Bearer {self.api_key}",
-        )
+            "x-aim-litellm-hook": hook,
        } | ({"x-aim-user-email": user_email} if user_email else {})
        response = await self.async_handler.post(
            f"{self.api_base}/detect/openai",
            headers=headers,
--- a/litellm/proxy/guardrails/guardrail_hooks/aporia_ai.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/aporia_ai.py
@ -178,7 +178,7 @@ class AporiaGuardrail(CustomGuardrail):
        pass
    @log_guardrail_information
-    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
+    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
@ -188,6 +188,7 @@ class AporiaGuardrail(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        from litellm.proxy.common_utils.callback_utils import (
--- a/litellm/proxy/guardrails/guardrail_hooks/bedrock_guardrails.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/bedrock_guardrails.py
@ -240,7 +240,7 @@ class BedrockGuardrail(CustomGuardrail, BaseAWSLLM):
            )
    @log_guardrail_information
-    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
+    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
@ -250,6 +250,7 @@ class BedrockGuardrail(CustomGuardrail, BaseAWSLLM):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        from litellm.proxy.common_utils.callback_utils import (
--- a/litellm/proxy/guardrails/guardrail_hooks/custom_guardrail.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/custom_guardrail.py
@ -70,6 +70,7 @@ class myCustomGuardrail(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        """
--- a/litellm/proxy/guardrails/guardrail_hooks/lakera_ai.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/lakera_ai.py
@ -134,6 +134,7 @@ class lakeraAI_Moderation(CustomGuardrail):
            "audio_transcription",
            "pass_through_endpoint",
            "rerank",
            "responses",
        ],
    ):
        if (
@ -335,7 +336,7 @@ class lakeraAI_Moderation(CustomGuardrail):
        )
    @log_guardrail_information
-    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
+    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
@ -345,6 +346,7 @@ class lakeraAI_Moderation(CustomGuardrail):
            "image_generation",
            "moderation",
            "audio_transcription",
            "responses",
        ],
    ):
        if self.event_hook is None:
--- a/litellm/proxy/litellm_pre_call_utils.py
+++ b/litellm/proxy/litellm_pre_call_utils.py
@ -62,10 +62,18 @@ def _get_metadata_variable_name(request: Request) -> str:
    """
    if RouteChecks._is_assistants_api_request(request):
        return "litellm_metadata"
-    if "batches" in request.url.path:
+
-        return "litellm_metadata"
+    LITELLM_METADATA_ROUTES = [
-    if "/v1/messages" in request.url.path:
+        "batches",
-        # anthropic API has a field called metadata
+        "/v1/messages",
        "responses",
    ]
    if any(
        [
            litellm_metadata_route in request.url.path
            for litellm_metadata_route in LITELLM_METADATA_ROUTES
        ]
    ):
        return "litellm_metadata"
    else:
        return "metadata"
--- a/litellm/proxy/openai_files_endpoints/files_endpoints.py
+++ b/litellm/proxy/openai_files_endpoints/files_endpoints.py
@ -27,6 +27,7 @@ from litellm import CreateFileRequest, get_secret_str
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 from litellm.proxy.common_utils.openai_endpoint_utils import (
    get_custom_llm_provider_from_request_body,
 )
@ -145,7 +146,6 @@ async def create_file(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
@ -234,7 +234,7 @@ async def create_file(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -309,7 +309,6 @@ async def get_file_content(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        proxy_config,
        proxy_logging_obj,
        version,
@ -351,7 +350,7 @@ async def get_file_content(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -437,7 +436,6 @@ async def get_file(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        proxy_config,
        proxy_logging_obj,
        version,
@ -477,7 +475,7 @@ async def get_file(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -554,7 +552,6 @@ async def delete_file(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        proxy_config,
        proxy_logging_obj,
        version,
@ -595,7 +592,7 @@ async def delete_file(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -671,7 +668,6 @@ async def list_files(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        proxy_config,
        proxy_logging_obj,
        version,
@ -712,7 +708,7 @@ async def list_files(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
--- a/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py
+++ b/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py
@ -3,8 +3,8 @@ import asyncio
 import json
 from base64 import b64encode
 from datetime import datetime
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
-from urllib.parse import urlparse
+from urllib.parse import parse_qs, urlencode, urlparse
 import httpx
 from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
@ -23,6 +23,7 @@ from litellm.proxy._types import (
    UserAPIKeyAuth,
 )
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.llms.custom_http import httpxSpecialProvider
@ -106,7 +107,6 @@ async def chat_completion_pass_through_endpoint(  # noqa: PLR0915
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
@ -231,7 +231,7 @@ async def chat_completion_pass_through_endpoint(  # noqa: PLR0915
        verbose_proxy_logger.debug("final response: %s", response)
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -307,6 +307,21 @@ class HttpPassThroughEndpointHelpers:
            return EndpointType.ANTHROPIC
        return EndpointType.GENERIC
    @staticmethod
    def get_merged_query_parameters(
        existing_url: httpx.URL, request_query_params: Dict[str, Union[str, list]]
    ) -> Dict[str, Union[str, List[str]]]:
        # Get the existing query params from the target URL
        existing_query_string = existing_url.query.decode("utf-8")
        existing_query_params = parse_qs(existing_query_string)
        # parse_qs returns a dict where each value is a list, so let's flatten it
        updated_existing_query_params = {
            k: v[0] if len(v) == 1 else v for k, v in existing_query_params.items()
        }
        # Merge the query params, giving priority to the existing ones
        return {**request_query_params, **updated_existing_query_params}
    @staticmethod
    async def _make_non_streaming_http_request(
        request: Request,
@ -346,6 +361,7 @@ async def pass_through_request(  # noqa: PLR0915
    user_api_key_dict: UserAPIKeyAuth,
    custom_body: Optional[dict] = None,
    forward_headers: Optional[bool] = False,
    merge_query_params: Optional[bool] = False,
    query_params: Optional[dict] = None,
    stream: Optional[bool] = None,
 ):
@ -361,6 +377,18 @@ async def pass_through_request(  # noqa: PLR0915
            request=request, headers=headers, forward_headers=forward_headers
        )
        if merge_query_params:
            # Create a new URL with the merged query params
            url = url.copy_with(
                query=urlencode(
                    HttpPassThroughEndpointHelpers.get_merged_query_parameters(
                        existing_url=url,
                        request_query_params=dict(request.query_params),
                    )
                ).encode("ascii")
            )
        endpoint_type: EndpointType = HttpPassThroughEndpointHelpers.get_endpoint_type(
            str(url)
        )
@ -657,6 +685,7 @@ def create_pass_through_route(
    target: str,
    custom_headers: Optional[dict] = None,
    _forward_headers: Optional[bool] = False,
    _merge_query_params: Optional[bool] = False,
    dependencies: Optional[List] = None,
 ):
    # check if target is an adapter.py or a url
@ -703,6 +732,7 @@ def create_pass_through_route(
                custom_headers=custom_headers or {},
                user_api_key_dict=user_api_key_dict,
                forward_headers=_forward_headers,
                merge_query_params=_merge_query_params,
                query_params=query_params,
                stream=stream,
                custom_body=custom_body,
@ -732,6 +762,7 @@ async def initialize_pass_through_endpoints(pass_through_endpoints: list):
            custom_headers=_custom_headers
        )
        _forward_headers = endpoint.get("forward_headers", None)
        _merge_query_params = endpoint.get("merge_query_params", None)
        _auth = endpoint.get("auth", None)
        _dependencies = None
        if _auth is not None and str(_auth).lower() == "true":
@ -753,7 +784,12 @@ async def initialize_pass_through_endpoints(pass_through_endpoints: list):
        app.add_api_route(  # type: ignore
            path=_path,
            endpoint=create_pass_through_route(  # type: ignore
-                _path, _target, _custom_headers, _forward_headers, _dependencies
+                _path,
                _target,
                _custom_headers,
                _forward_headers,
                _merge_query_params,
                _dependencies,
            ),
            methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
            dependencies=_dependencies,
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -1,10 +1,6 @@
 model_list:
-  - model_name: thinking-us.anthropic.claude-3-7-sonnet-20250219-v1:0
+  - model_name: gpt-4o
    litellm_params:
-      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+      model: gpt-4o
      thinking: {"type": "enabled", "budget_tokens": 1024}
      max_tokens: 1080
      merge_reasoning_content_in_choices: true
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -114,6 +114,7 @@ from litellm.litellm_core_utils.core_helpers import (
    _get_parent_otel_span_from_kwargs,
    get_litellm_metadata_from_kwargs,
 )
 from litellm.litellm_core_utils.credential_accessor import CredentialAccessor
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.proxy._types import *
@ -138,12 +139,9 @@ from litellm.proxy.batches_endpoints.endpoints import router as batches_router
 ## Import All Misc routes here ##
 from litellm.proxy.caching_routes import router as caching_router
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 from litellm.proxy.common_utils.admin_ui_utils import html_form
-from litellm.proxy.common_utils.callback_utils import (
+from litellm.proxy.common_utils.callback_utils import initialize_callbacks_on_proxy
    get_logging_caching_headers,
    get_remaining_tokens_and_requests_from_request_data,
    initialize_callbacks_on_proxy,
 )
 from litellm.proxy.common_utils.debug_utils import init_verbose_loggers
 from litellm.proxy.common_utils.debug_utils import router as debugging_endpoints_router
 from litellm.proxy.common_utils.encrypt_decrypt_utils import (
@ -164,6 +162,7 @@ from litellm.proxy.common_utils.openai_endpoint_utils import (
 from litellm.proxy.common_utils.proxy_state import ProxyState
 from litellm.proxy.common_utils.reset_budget_job import ResetBudgetJob
 from litellm.proxy.common_utils.swagger_utils import ERROR_RESPONSES
 from litellm.proxy.credential_endpoints.endpoints import router as credential_router
 from litellm.proxy.fine_tuning_endpoints.endpoints import router as fine_tuning_router
 from litellm.proxy.fine_tuning_endpoints.endpoints import set_fine_tuning_config
 from litellm.proxy.guardrails.guardrail_endpoints import router as guardrails_router
@ -234,6 +233,7 @@ from litellm.proxy.pass_through_endpoints.pass_through_endpoints import (
    router as pass_through_router,
 )
 from litellm.proxy.rerank_endpoints.endpoints import router as rerank_router
 from litellm.proxy.response_api_endpoints.endpoints import router as response_router
 from litellm.proxy.route_llm_request import route_request
 from litellm.proxy.spend_tracking.spend_management_endpoints import (
    router as spend_management_router,
@ -287,7 +287,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import DeploymentTypedDict
 from litellm.types.router import ModelInfo as RouterModelInfo
 from litellm.types.router import RouterGeneralSettings, updateDeployment
-from litellm.types.utils import CustomHuggingfaceTokenizer
+from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
 from litellm.types.utils import ModelInfo as ModelMapInfo
 from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload
 from litellm.utils import _add_custom_logger_callback_to_specific_event
@ -781,69 +781,6 @@ db_writer_client: Optional[AsyncHTTPHandler] = None
 ### logger ###
 def get_custom_headers(
    *,
    user_api_key_dict: UserAPIKeyAuth,
    call_id: Optional[str] = None,
    model_id: Optional[str] = None,
    cache_key: Optional[str] = None,
    api_base: Optional[str] = None,
    version: Optional[str] = None,
    model_region: Optional[str] = None,
    response_cost: Optional[Union[float, str]] = None,
    hidden_params: Optional[dict] = None,
    fastest_response_batch_completion: Optional[bool] = None,
    request_data: Optional[dict] = {},
    timeout: Optional[Union[float, int, httpx.Timeout]] = None,
    **kwargs,
 ) -> dict:
    exclude_values = {"", None, "None"}
    hidden_params = hidden_params or {}
    headers = {
        "x-litellm-call-id": call_id,
        "x-litellm-model-id": model_id,
        "x-litellm-cache-key": cache_key,
        "x-litellm-model-api-base": api_base,
        "x-litellm-version": version,
        "x-litellm-model-region": model_region,
        "x-litellm-response-cost": str(response_cost),
        "x-litellm-key-tpm-limit": str(user_api_key_dict.tpm_limit),
        "x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit),
        "x-litellm-key-max-budget": str(user_api_key_dict.max_budget),
        "x-litellm-key-spend": str(user_api_key_dict.spend),
        "x-litellm-response-duration-ms": str(hidden_params.get("_response_ms", None)),
        "x-litellm-overhead-duration-ms": str(
            hidden_params.get("litellm_overhead_time_ms", None)
        ),
        "x-litellm-fastest_response_batch_completion": (
            str(fastest_response_batch_completion)
            if fastest_response_batch_completion is not None
            else None
        ),
        "x-litellm-timeout": str(timeout) if timeout is not None else None,
        **{k: str(v) for k, v in kwargs.items()},
    }
    if request_data:
        remaining_tokens_header = get_remaining_tokens_and_requests_from_request_data(
            request_data
        )
        headers.update(remaining_tokens_header)
        logging_caching_headers = get_logging_caching_headers(request_data)
        if logging_caching_headers:
            headers.update(logging_caching_headers)
    try:
        return {
            key: str(value)
            for key, value in headers.items()
            if value not in exclude_values
        }
    except Exception as e:
        verbose_proxy_logger.error(f"Error setting custom headers: {e}")
        return {}
 async def check_request_disconnection(request: Request, llm_api_call_task):
    """
    Asynchronously checks if the request is disconnected at regular intervals.
@ -1723,6 +1660,16 @@ class ProxyConfig:
            )
            return {}
    def load_credential_list(self, config: dict) -> List[CredentialItem]:
        """
        Load the credential list from the database
        """
        credential_list_dict = config.get("credential_list")
        credential_list = []
        if credential_list_dict:
            credential_list = [CredentialItem(**cred) for cred in credential_list_dict]
        return credential_list
    async def load_config(  # noqa: PLR0915
        self, router: Optional[litellm.Router], config_file_path: str
    ):
@ -2186,6 +2133,10 @@ class ProxyConfig:
            init_guardrails_v2(
                all_guardrails=guardrails_v2, config_file_path=config_file_path
            )
        ## CREDENTIALS
        credential_list_dict = self.load_credential_list(config=config)
        litellm.credential_list = credential_list_dict
        return router, router.get_model_list(), general_settings
    def _load_alerting_settings(self, general_settings: dict):
@ -2832,6 +2783,60 @@ class ProxyConfig:
                )
            )
    def decrypt_credentials(self, credential: Union[dict, BaseModel]) -> CredentialItem:
        if isinstance(credential, dict):
            credential_object = CredentialItem(**credential)
        elif isinstance(credential, BaseModel):
            credential_object = CredentialItem(**credential.model_dump())
        decrypted_credential_values = {}
        for k, v in credential_object.credential_values.items():
            decrypted_credential_values[k] = decrypt_value_helper(v) or v
        credential_object.credential_values = decrypted_credential_values
        return credential_object
    async def delete_credentials(self, db_credentials: List[CredentialItem]):
        """
        Create all-up list of db credentials + local credentials
        Compare to the litellm.credential_list
        Delete any from litellm.credential_list that are not in the all-up list
        """
        ## CONFIG credentials ##
        config = await self.get_config(config_file_path=user_config_file_path)
        credential_list = self.load_credential_list(config=config)
        ## COMBINED LIST ##
        combined_list = db_credentials + credential_list
        ## DELETE ##
        idx_to_delete = []
        for idx, credential in enumerate(litellm.credential_list):
            if credential.credential_name not in [
                cred.credential_name for cred in combined_list
            ]:
                idx_to_delete.append(idx)
        for idx in sorted(idx_to_delete, reverse=True):
            litellm.credential_list.pop(idx)
    async def get_credentials(self, prisma_client: PrismaClient):
        try:
            credentials = await prisma_client.db.litellm_credentialstable.find_many()
            credentials = [self.decrypt_credentials(cred) for cred in credentials]
            await self.delete_credentials(
                credentials
            )  # delete credentials that are not in the all-up list
            CredentialAccessor.upsert_credentials(
                credentials
            )  # upsert credentials that are in the all-up list
        except Exception as e:
            verbose_proxy_logger.exception(
                "litellm.proxy_server.py::get_credentials() - Error getting credentials from DB - {}".format(
                    str(e)
                )
            )
            return []
 proxy_config = ProxyConfig()
@ -3253,6 +3258,14 @@ class ProxyStartupEvent:
                prisma_client=prisma_client, proxy_logging_obj=proxy_logging_obj
            )
        ### GET STORED CREDENTIALS ###
        scheduler.add_job(
            proxy_config.get_credentials,
            "interval",
            seconds=10,
            args=[prisma_client],
        )
        await proxy_config.get_credentials(prisma_client=prisma_client)
        if (
            proxy_logging_obj is not None
            and proxy_logging_obj.slack_alerting_instance.alerting is not None
@ -3475,169 +3488,28 @@ async def chat_completion(  # noqa: PLR0915
    """
    global general_settings, user_debug, proxy_logging_obj, llm_model_list
    data = {}
    try:
        data = await _read_request_body(request=request)
        verbose_proxy_logger.debug(
            "Request received by LiteLLM:\n{}".format(json.dumps(data, indent=4)),
        )
        data = await add_litellm_data_to_request(
            data=data,
            request=request,
            general_settings=general_settings,
            user_api_key_dict=user_api_key_dict,
            version=version,
            proxy_config=proxy_config,
        )
        data["model"] = (
            general_settings.get("completion_model", None)  # server default
            or user_model  # model name passed via cli args
            or model  # for azure deployments
            or data.get("model", None)  # default passed in http request
        )
    global user_temperature, user_request_timeout, user_max_tokens, user_api_base
-        # override with user settings, these are params passed via cli
+    data = await _read_request_body(request=request)
-        if user_temperature:
+    base_llm_response_processor = ProxyBaseLLMRequestProcessing(data=data)
-            data["temperature"] = user_temperature
+    try:
-        if user_request_timeout:
+        return await base_llm_response_processor.base_process_llm_request(
-            data["request_timeout"] = user_request_timeout
+            request=request,
-        if user_max_tokens:
+            fastapi_response=fastapi_response,
            data["max_tokens"] = user_max_tokens
        if user_api_base:
            data["api_base"] = user_api_base
        ### MODEL ALIAS MAPPING ###
        # check if model name in model alias map
        # get the actual model name
        if isinstance(data["model"], str) and data["model"] in litellm.model_alias_map:
            data["model"] = litellm.model_alias_map[data["model"]]
        ### CALL HOOKS ### - modify/reject incoming data before calling the model
        data = await proxy_logging_obj.pre_call_hook(  # type: ignore
            user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
        )
        ## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
        ## IMPORTANT Note: - initialize this before running pre-call checks. Ensures we log rejected requests to langfuse.
        data["litellm_call_id"] = request.headers.get(
            "x-litellm-call-id", str(uuid.uuid4())
        )
        logging_obj, data = litellm.utils.function_setup(
            original_function="acompletion",
            rules_obj=litellm.utils.Rules(),
            start_time=datetime.now(),
            **data,
        )
        data["litellm_logging_obj"] = logging_obj
        tasks = []
        tasks.append(
            proxy_logging_obj.during_call_hook(
                data=data,
            user_api_key_dict=user_api_key_dict,
                call_type="completion",
            )
        )
        ### ROUTE THE REQUEST ###
        # Do not change this - it should be a constant time fetch - ALWAYS
        llm_call = await route_request(
            data=data,
            route_type="acompletion",
            proxy_logging_obj=proxy_logging_obj,
            llm_router=llm_router,
            general_settings=general_settings,
            proxy_config=proxy_config,
            select_data_generator=select_data_generator,
            model=model,
            user_model=user_model,
-        )
+            user_temperature=user_temperature,
-        tasks.append(llm_call)
+            user_request_timeout=user_request_timeout,
-
+            user_max_tokens=user_max_tokens,
-        # wait for call to end
+            user_api_base=user_api_base,
        llm_responses = asyncio.gather(
            *tasks
        )  # run the moderation check in parallel to the actual llm api call
        responses = await llm_responses
        response = responses[1]
        hidden_params = getattr(response, "_hidden_params", {}) or {}
        model_id = hidden_params.get("model_id", None) or ""
        cache_key = hidden_params.get("cache_key", None) or ""
        api_base = hidden_params.get("api_base", None) or ""
        response_cost = hidden_params.get("response_cost", None) or ""
        fastest_response_batch_completion = hidden_params.get(
            "fastest_response_batch_completion", None
        )
        additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
        # Post Call Processing
        if llm_router is not None:
            data["deployment"] = llm_router.get_deployment(model_id=model_id)
        asyncio.create_task(
            proxy_logging_obj.update_request_status(
                litellm_call_id=data.get("litellm_call_id", ""), status="success"
            )
        )
        if (
            "stream" in data and data["stream"] is True
        ):  # use generate_responses to stream responses
            custom_headers = get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                call_id=logging_obj.litellm_call_id,
                model_id=model_id,
                cache_key=cache_key,
                api_base=api_base,
            version=version,
                response_cost=response_cost,
                model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
                fastest_response_batch_completion=fastest_response_batch_completion,
                request_data=data,
                hidden_params=hidden_params,
                **additional_headers,
        )
            selected_data_generator = select_data_generator(
                response=response,
                user_api_key_dict=user_api_key_dict,
                request_data=data,
            )
            return StreamingResponse(
                selected_data_generator,
                media_type="text/event-stream",
                headers=custom_headers,
            )
        ### CALL HOOKS ### - modify outgoing data
        response = await proxy_logging_obj.post_call_success_hook(
            data=data, user_api_key_dict=user_api_key_dict, response=response
        )
        hidden_params = (
            getattr(response, "_hidden_params", {}) or {}
        )  # get any updated response headers
        additional_headers = hidden_params.get("additional_headers", {}) or {}
        fastapi_response.headers.update(
            get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                call_id=logging_obj.litellm_call_id,
                model_id=model_id,
                cache_key=cache_key,
                api_base=api_base,
                version=version,
                response_cost=response_cost,
                model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
                fastest_response_batch_completion=fastest_response_batch_completion,
                request_data=data,
                hidden_params=hidden_params,
                **additional_headers,
            )
        )
        await check_response_size_is_safe(response=response)
        return response
    except RejectedRequestError as e:
        _data = e.request_data
        await proxy_logging_obj.post_call_failure_hook(
@ -3672,55 +3544,10 @@ async def chat_completion(  # noqa: PLR0915
        _chat_response.usage = _usage  # type: ignore
        return _chat_response
    except Exception as e:
-        verbose_proxy_logger.exception(
+        raise await base_llm_response_processor._handle_llm_api_exception(
-            f"litellm.proxy.proxy_server.chat_completion(): Exception occured - {str(e)}"
+            e=e,
        )
        await proxy_logging_obj.post_call_failure_hook(
            user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
        )
        litellm_debug_info = getattr(e, "litellm_debug_info", "")
        verbose_proxy_logger.debug(
            "\033[1;31mAn error occurred: %s %s\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`",
            e,
            litellm_debug_info,
        )
        timeout = getattr(
            e, "timeout", None
        )  # returns the timeout set by the wrapper. Used for testing if model-specific timeout are set correctly
        _litellm_logging_obj: Optional[LiteLLMLoggingObj] = data.get(
            "litellm_logging_obj", None
        )
        custom_headers = get_custom_headers(
            user_api_key_dict=user_api_key_dict,
-            call_id=(
+            proxy_logging_obj=proxy_logging_obj,
                _litellm_logging_obj.litellm_call_id if _litellm_logging_obj else None
            ),
            version=version,
            response_cost=0,
            model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
            request_data=data,
            timeout=timeout,
        )
        headers = getattr(e, "headers", {}) or {}
        headers.update(custom_headers)
        if isinstance(e, HTTPException):
            raise ProxyException(
                message=getattr(e, "detail", str(e)),
                type=getattr(e, "type", "None"),
                param=getattr(e, "param", "None"),
                code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
                headers=headers,
            )
        error_msg = f"{str(e)}"
        raise ProxyException(
            message=getattr(e, "message", error_msg),
            type=getattr(e, "type", "None"),
            param=getattr(e, "param", "None"),
            openai_code=getattr(e, "code", None),
            code=getattr(e, "status_code", 500),
            headers=headers,
        )
@ -3837,7 +3664,7 @@ async def completion(  # noqa: PLR0915
        if (
            "stream" in data and data["stream"] is True
        ):  # use generate_responses to stream responses
-            custom_headers = get_custom_headers(
+            custom_headers = ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                call_id=litellm_call_id,
                model_id=model_id,
@ -3865,7 +3692,7 @@ async def completion(  # noqa: PLR0915
        )
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                call_id=litellm_call_id,
                model_id=model_id,
@ -4096,7 +3923,7 @@ async def embeddings(  # noqa: PLR0915
        additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -4224,7 +4051,7 @@ async def image_generation(
        litellm_call_id = hidden_params.get("litellm_call_id", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -4345,7 +4172,7 @@ async def audio_speech(
            async for chunk in _generator:
                yield chunk
-        custom_headers = get_custom_headers(
+        custom_headers = ProxyBaseLLMRequestProcessing.get_custom_headers(
            user_api_key_dict=user_api_key_dict,
            model_id=model_id,
            cache_key=cache_key,
@ -4486,7 +4313,7 @@ async def audio_transcriptions(
        additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -4638,7 +4465,7 @@ async def get_assistants(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -4737,7 +4564,7 @@ async def create_assistant(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -4834,7 +4661,7 @@ async def delete_assistant(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -4931,7 +4758,7 @@ async def create_threads(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -5027,7 +4854,7 @@ async def get_thread(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -5126,7 +4953,7 @@ async def add_messages(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -5221,7 +5048,7 @@ async def get_messages(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -5330,7 +5157,7 @@ async def run_thread(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -5453,7 +5280,7 @@ async def moderations(
        api_base = hidden_params.get("api_base", None) or ""
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
@ -8597,9 +8424,11 @@ async def get_routes():
 app.include_router(router)
 app.include_router(response_router)
 app.include_router(batches_router)
 app.include_router(rerank_router)
 app.include_router(fine_tuning_router)
 app.include_router(credential_router)
 app.include_router(vertex_router)
 app.include_router(llm_passthrough_router)
 app.include_router(anthropic_router)
--- a/litellm/proxy/rerank_endpoints/endpoints.py
+++ b/litellm/proxy/rerank_endpoints/endpoints.py
@ -7,10 +7,12 @@ from fastapi.responses import ORJSONResponse
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 router = APIRouter()
 import asyncio
@router.post(
    "/v2/rerank",
    dependencies=[Depends(user_api_key_auth)],
@ -37,7 +39,6 @@ async def rerank(
    from litellm.proxy.proxy_server import (
        add_litellm_data_to_request,
        general_settings,
        get_custom_headers,
        llm_router,
        proxy_config,
        proxy_logging_obj,
@ -89,7 +90,7 @@ async def rerank(
        api_base = hidden_params.get("api_base", None) or ""
        additional_headers = hidden_params.get("additional_headers", None) or {}
        fastapi_response.headers.update(
-            get_custom_headers(
+            ProxyBaseLLMRequestProcessing.get_custom_headers(
                user_api_key_dict=user_api_key_dict,
                model_id=model_id,
                cache_key=cache_key,
--- a/litellm/proxy/response_api_endpoints/endpoints.py
+++ b/litellm/proxy/response_api_endpoints/endpoints.py
@ -0,0 +1,80 @@
 from fastapi import APIRouter, Depends, Request, Response
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import UserAPIKeyAuth, user_api_key_auth
 from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
 router = APIRouter()
@router.post(
    "/v1/responses",
    dependencies=[Depends(user_api_key_auth)],
    tags=["responses"],
 )
@router.post(
    "/responses",
    dependencies=[Depends(user_api_key_auth)],
    tags=["responses"],
 )
 async def responses_api(
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
    Follows the OpenAI Responses API spec: https://platform.openai.com/docs/api-reference/responses
    ```bash
    curl -X POST http://localhost:4000/v1/responses \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer sk-1234" \
    -d '{
        "model": "gpt-4o",
        "input": "Tell me about AI"
    }'
    ```
    """
    from litellm.proxy.proxy_server import (
        _read_request_body,
        general_settings,
        llm_router,
        proxy_config,
        proxy_logging_obj,
        select_data_generator,
        user_api_base,
        user_max_tokens,
        user_model,
        user_request_timeout,
        user_temperature,
        version,
    )
    data = await _read_request_body(request=request)
    processor = ProxyBaseLLMRequestProcessing(data=data)
    try:
        return await processor.base_process_llm_request(
            request=request,
            fastapi_response=fastapi_response,
            user_api_key_dict=user_api_key_dict,
            route_type="aresponses",
            proxy_logging_obj=proxy_logging_obj,
            llm_router=llm_router,
            general_settings=general_settings,
            proxy_config=proxy_config,
            select_data_generator=select_data_generator,
            model=None,
            user_model=user_model,
            user_temperature=user_temperature,
            user_request_timeout=user_request_timeout,
            user_max_tokens=user_max_tokens,
            user_api_base=user_api_base,
            version=version,
        )
    except Exception as e:
        raise await processor._handle_llm_api_exception(
            e=e,
            user_api_key_dict=user_api_key_dict,
            proxy_logging_obj=proxy_logging_obj,
            version=version,
        )
--- a/litellm/proxy/route_llm_request.py
+++ b/litellm/proxy/route_llm_request.py
@ -21,6 +21,7 @@ ROUTE_ENDPOINT_MAPPING = {
    "atranscription": "/audio/transcriptions",
    "amoderation": "/moderations",
    "arerank": "/rerank",
    "aresponses": "/responses",
 }
@ -45,6 +46,7 @@ async def route_request(
        "atranscription",
        "amoderation",
        "arerank",
        "aresponses",
        "_arealtime",  # private function for realtime API
    ],
 ):
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@ -29,6 +29,18 @@ model LiteLLM_BudgetTable {
  organization_membership LiteLLM_OrganizationMembership[] // budgets of Users within a Organization 
 }
 // Models on proxy
 model LiteLLM_CredentialsTable {
  credential_id String @id @default(uuid())
  credential_name String @unique
  credential_values Json
  credential_info Json? 
  created_at    DateTime               @default(now()) @map("created_at")
  created_by String
  updated_at    DateTime               @default(now()) @updatedAt @map("updated_at")
  updated_by String
 }
 // Models on proxy
 model LiteLLM_ProxyModelTable {
  model_id String @id @default(uuid())
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -537,6 +537,7 @@ class ProxyLogging:
        user_api_key_dict: UserAPIKeyAuth,
        call_type: Literal[
            "completion",
            "responses",
            "embeddings",
            "image_generation",
            "moderation",
--- a/litellm/responses/main.py
+++ b/litellm/responses/main.py
@ -0,0 +1,217 @@
 import asyncio
 import contextvars
 from functools import partial
 from typing import Any, Dict, Iterable, List, Literal, Optional, Union
 import httpx
 import litellm
 from litellm.constants import request_timeout
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
 from litellm.responses.utils import ResponsesAPIRequestUtils
 from litellm.types.llms.openai import (
    Reasoning,
    ResponseIncludable,
    ResponseInputParam,
    ResponsesAPIOptionalRequestParams,
    ResponsesAPIResponse,
    ResponseTextConfigParam,
    ToolChoice,
    ToolParam,
 )
 from litellm.types.router import GenericLiteLLMParams
 from litellm.utils import ProviderConfigManager, client
 from .streaming_iterator import BaseResponsesAPIStreamingIterator
 ####### ENVIRONMENT VARIABLES ###################
 # Initialize any necessary instances or variables here
 base_llm_http_handler = BaseLLMHTTPHandler()
 #################################################
@client
 async def aresponses(
    input: Union[str, ResponseInputParam],
    model: str,
    include: Optional[List[ResponseIncludable]] = None,
    instructions: Optional[str] = None,
    max_output_tokens: Optional[int] = None,
    metadata: Optional[Dict[str, Any]] = None,
    parallel_tool_calls: Optional[bool] = None,
    previous_response_id: Optional[str] = None,
    reasoning: Optional[Reasoning] = None,
    store: Optional[bool] = None,
    stream: Optional[bool] = None,
    temperature: Optional[float] = None,
    text: Optional[ResponseTextConfigParam] = None,
    tool_choice: Optional[ToolChoice] = None,
    tools: Optional[Iterable[ToolParam]] = None,
    top_p: Optional[float] = None,
    truncation: Optional[Literal["auto", "disabled"]] = None,
    user: Optional[str] = None,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Optional[Dict[str, Any]] = None,
    extra_query: Optional[Dict[str, Any]] = None,
    extra_body: Optional[Dict[str, Any]] = None,
    timeout: Optional[Union[float, httpx.Timeout]] = None,
    **kwargs,
 ) -> Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator]:
    """
    Async: Handles responses API requests by reusing the synchronous function
    """
    try:
        loop = asyncio.get_event_loop()
        kwargs["aresponses"] = True
        func = partial(
            responses,
            input=input,
            model=model,
            include=include,
            instructions=instructions,
            max_output_tokens=max_output_tokens,
            metadata=metadata,
            parallel_tool_calls=parallel_tool_calls,
            previous_response_id=previous_response_id,
            reasoning=reasoning,
            store=store,
            stream=stream,
            temperature=temperature,
            text=text,
            tool_choice=tool_choice,
            tools=tools,
            top_p=top_p,
            truncation=truncation,
            user=user,
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
            **kwargs,
        )
        ctx = contextvars.copy_context()
        func_with_context = partial(ctx.run, func)
        init_response = await loop.run_in_executor(None, func_with_context)
        if asyncio.iscoroutine(init_response):
            response = await init_response
        else:
            response = init_response
        return response
    except Exception as e:
        raise e
@client
 def responses(
    input: Union[str, ResponseInputParam],
    model: str,
    include: Optional[List[ResponseIncludable]] = None,
    instructions: Optional[str] = None,
    max_output_tokens: Optional[int] = None,
    metadata: Optional[Dict[str, Any]] = None,
    parallel_tool_calls: Optional[bool] = None,
    previous_response_id: Optional[str] = None,
    reasoning: Optional[Reasoning] = None,
    store: Optional[bool] = None,
    stream: Optional[bool] = None,
    temperature: Optional[float] = None,
    text: Optional[ResponseTextConfigParam] = None,
    tool_choice: Optional[ToolChoice] = None,
    tools: Optional[Iterable[ToolParam]] = None,
    top_p: Optional[float] = None,
    truncation: Optional[Literal["auto", "disabled"]] = None,
    user: Optional[str] = None,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Optional[Dict[str, Any]] = None,
    extra_query: Optional[Dict[str, Any]] = None,
    extra_body: Optional[Dict[str, Any]] = None,
    timeout: Optional[Union[float, httpx.Timeout]] = None,
    **kwargs,
 ):
    """
    Synchronous version of the Responses API.
    Uses the synchronous HTTP handler to make requests.
    """
    litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj")  # type: ignore
    litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None)
    _is_async = kwargs.pop("aresponses", False) is True
    # get llm provider logic
    litellm_params = GenericLiteLLMParams(**kwargs)
    model, custom_llm_provider, dynamic_api_key, dynamic_api_base = (
        litellm.get_llm_provider(
            model=model,
            custom_llm_provider=kwargs.get("custom_llm_provider", None),
            api_base=litellm_params.api_base,
            api_key=litellm_params.api_key,
        )
    )
    # get provider config
    responses_api_provider_config: Optional[BaseResponsesAPIConfig] = (
        ProviderConfigManager.get_provider_responses_api_config(
            model=model,
            provider=litellm.LlmProviders(custom_llm_provider),
        )
    )
    if responses_api_provider_config is None:
        raise litellm.BadRequestError(
            model=model,
            llm_provider=custom_llm_provider,
            message=f"Responses API not available for custom_llm_provider={custom_llm_provider}, model: {model}",
        )
    # Get all parameters using locals() and combine with kwargs
    local_vars = locals()
    local_vars.update(kwargs)
    # Get ResponsesAPIOptionalRequestParams with only valid parameters
    response_api_optional_params: ResponsesAPIOptionalRequestParams = (
        ResponsesAPIRequestUtils.get_requested_response_api_optional_param(local_vars)
    )
    # Get optional parameters for the responses API
    responses_api_request_params: Dict = (
        ResponsesAPIRequestUtils.get_optional_params_responses_api(
            model=model,
            responses_api_provider_config=responses_api_provider_config,
            response_api_optional_params=response_api_optional_params,
        )
    )
    # Pre Call logging
    litellm_logging_obj.update_environment_variables(
        model=model,
        user=user,
        optional_params=dict(responses_api_request_params),
        litellm_params={
            "litellm_call_id": litellm_call_id,
            **responses_api_request_params,
        },
        custom_llm_provider=custom_llm_provider,
    )
    # Call the handler with _is_async flag instead of directly calling the async handler
    response = base_llm_http_handler.response_api_handler(
        model=model,
        input=input,
        responses_api_provider_config=responses_api_provider_config,
        response_api_optional_request_params=responses_api_request_params,
        custom_llm_provider=custom_llm_provider,
        litellm_params=litellm_params,
        logging_obj=litellm_logging_obj,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout or request_timeout,
        _is_async=_is_async,
        client=kwargs.get("client"),
    )
    return response
--- a/litellm/responses/streaming_iterator.py
+++ b/litellm/responses/streaming_iterator.py
@ -0,0 +1,209 @@
 import asyncio
 import json
 from datetime import datetime
 from typing import Optional
 import httpx
 from litellm.constants import STREAM_SSE_DONE_STRING
 from litellm.litellm_core_utils.asyncify import run_async_function
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.litellm_core_utils.thread_pool_executor import executor
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.types.llms.openai import (
    ResponsesAPIStreamEvents,
    ResponsesAPIStreamingResponse,
 )
 from litellm.utils import CustomStreamWrapper
 class BaseResponsesAPIStreamingIterator:
    """
    Base class for streaming iterators that process responses from the Responses API.
    This class contains shared logic for both synchronous and asynchronous iterators.
    """
    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
    ):
        self.response = response
        self.model = model
        self.logging_obj = logging_obj
        self.finished = False
        self.responses_api_provider_config = responses_api_provider_config
        self.completed_response: Optional[ResponsesAPIStreamingResponse] = None
        self.start_time = datetime.now()
    def _process_chunk(self, chunk):
        """Process a single chunk of data from the stream"""
        if not chunk:
            return None
        # Handle SSE format (data: {...})
        chunk = CustomStreamWrapper._strip_sse_data_from_chunk(chunk)
        if chunk is None:
            return None
        # Handle "[DONE]" marker
        if chunk == STREAM_SSE_DONE_STRING:
            self.finished = True
            return None
        try:
            # Parse the JSON chunk
            parsed_chunk = json.loads(chunk)
            # Format as ResponsesAPIStreamingResponse
            if isinstance(parsed_chunk, dict):
                openai_responses_api_chunk = (
                    self.responses_api_provider_config.transform_streaming_response(
                        model=self.model,
                        parsed_chunk=parsed_chunk,
                        logging_obj=self.logging_obj,
                    )
                )
                # Store the completed response
                if (
                    openai_responses_api_chunk
                    and openai_responses_api_chunk.type
                    == ResponsesAPIStreamEvents.RESPONSE_COMPLETED
                ):
                    self.completed_response = openai_responses_api_chunk
                    self._handle_logging_completed_response()
                return openai_responses_api_chunk
            return None
        except json.JSONDecodeError:
            # If we can't parse the chunk, continue
            return None
    def _handle_logging_completed_response(self):
        """Base implementation - should be overridden by subclasses"""
        pass
 class ResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
    """
    Async iterator for processing streaming responses from the Responses API.
    """
    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
    ):
        super().__init__(response, model, responses_api_provider_config, logging_obj)
        self.stream_iterator = response.aiter_lines()
    def __aiter__(self):
        return self
    async def __anext__(self) -> ResponsesAPIStreamingResponse:
        try:
            while True:
                # Get the next chunk from the stream
                try:
                    chunk = await self.stream_iterator.__anext__()
                except StopAsyncIteration:
                    self.finished = True
                    raise StopAsyncIteration
                result = self._process_chunk(chunk)
                if self.finished:
                    raise StopAsyncIteration
                elif result is not None:
                    return result
                # If result is None, continue the loop to get the next chunk
        except httpx.HTTPError as e:
            # Handle HTTP errors
            self.finished = True
            raise e
    def _handle_logging_completed_response(self):
        """Handle logging for completed responses in async context"""
        asyncio.create_task(
            self.logging_obj.async_success_handler(
                result=self.completed_response,
                start_time=self.start_time,
                end_time=datetime.now(),
                cache_hit=None,
            )
        )
        executor.submit(
            self.logging_obj.success_handler,
            result=self.completed_response,
            cache_hit=None,
            start_time=self.start_time,
            end_time=datetime.now(),
        )
 class SyncResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
    """
    Synchronous iterator for processing streaming responses from the Responses API.
    """
    def __init__(
        self,
        response: httpx.Response,
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        logging_obj: LiteLLMLoggingObj,
    ):
        super().__init__(response, model, responses_api_provider_config, logging_obj)
        self.stream_iterator = response.iter_lines()
    def __iter__(self):
        return self
    def __next__(self):
        try:
            while True:
                # Get the next chunk from the stream
                try:
                    chunk = next(self.stream_iterator)
                except StopIteration:
                    self.finished = True
                    raise StopIteration
                result = self._process_chunk(chunk)
                if self.finished:
                    raise StopIteration
                elif result is not None:
                    return result
                # If result is None, continue the loop to get the next chunk
        except httpx.HTTPError as e:
            # Handle HTTP errors
            self.finished = True
            raise e
    def _handle_logging_completed_response(self):
        """Handle logging for completed responses in sync context"""
        run_async_function(
            async_function=self.logging_obj.async_success_handler,
            result=self.completed_response,
            start_time=self.start_time,
            end_time=datetime.now(),
            cache_hit=None,
        )
        executor.submit(
            self.logging_obj.success_handler,
            result=self.completed_response,
            cache_hit=None,
            start_time=self.start_time,
            end_time=datetime.now(),
        )
--- a/litellm/responses/utils.py
+++ b/litellm/responses/utils.py
@ -0,0 +1,97 @@
 from typing import Any, Dict, cast, get_type_hints
 import litellm
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.types.llms.openai import (
    ResponseAPIUsage,
    ResponsesAPIOptionalRequestParams,
 )
 from litellm.types.utils import Usage
 class ResponsesAPIRequestUtils:
    """Helper utils for constructing ResponseAPI requests"""
    @staticmethod
    def get_optional_params_responses_api(
        model: str,
        responses_api_provider_config: BaseResponsesAPIConfig,
        response_api_optional_params: ResponsesAPIOptionalRequestParams,
    ) -> Dict:
        """
        Get optional parameters for the responses API.
        Args:
            params: Dictionary of all parameters
            model: The model name
            responses_api_provider_config: The provider configuration for responses API
        Returns:
            A dictionary of supported parameters for the responses API
        """
        # Remove None values and internal parameters
        # Get supported parameters for the model
        supported_params = responses_api_provider_config.get_supported_openai_params(
            model
        )
        # Check for unsupported parameters
        unsupported_params = [
            param
            for param in response_api_optional_params
            if param not in supported_params
        ]
        if unsupported_params:
            raise litellm.UnsupportedParamsError(
                model=model,
                message=f"The following parameters are not supported for model {model}: {', '.join(unsupported_params)}",
            )
        # Map parameters to provider-specific format
        mapped_params = responses_api_provider_config.map_openai_params(
            response_api_optional_params=response_api_optional_params,
            model=model,
            drop_params=litellm.drop_params,
        )
        return mapped_params
    @staticmethod
    def get_requested_response_api_optional_param(
        params: Dict[str, Any]
    ) -> ResponsesAPIOptionalRequestParams:
        """
        Filter parameters to only include those defined in ResponsesAPIOptionalRequestParams.
        Args:
            params: Dictionary of parameters to filter
        Returns:
            ResponsesAPIOptionalRequestParams instance with only the valid parameters
        """
        valid_keys = get_type_hints(ResponsesAPIOptionalRequestParams).keys()
        filtered_params = {k: v for k, v in params.items() if k in valid_keys}
        return cast(ResponsesAPIOptionalRequestParams, filtered_params)
 class ResponseAPILoggingUtils:
    @staticmethod
    def _is_response_api_usage(usage: dict) -> bool:
        """returns True if usage is from OpenAI Response API"""
        if "input_tokens" in usage and "output_tokens" in usage:
            return True
        return False
    @staticmethod
    def _transform_response_api_usage_to_chat_usage(usage: dict) -> Usage:
        """Tranforms the ResponseAPIUsage object to a Usage object"""
        response_api_usage: ResponseAPIUsage = ResponseAPIUsage(**usage)
        prompt_tokens: int = response_api_usage.input_tokens or 0
        completion_tokens: int = response_api_usage.output_tokens or 0
        return Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        )
--- a/litellm/router.py
+++ b/litellm/router.py
@ -71,7 +71,7 @@ from litellm.router_utils.batch_utils import (
    _get_router_metadata_variable_name,
    replace_model_in_jsonl,
 )
-from litellm.router_utils.client_initalization_utils import InitalizeOpenAISDKClient
+from litellm.router_utils.client_initalization_utils import InitalizeCachedClient
 from litellm.router_utils.clientside_credential_handler import (
    get_dynamic_litellm_params,
    is_clientside_credential,
@ -581,13 +581,7 @@ class Router:
            self._initialize_alerting()
        self.initialize_assistants_endpoint()
-
+        self.initialize_router_endpoints()
        self.amoderation = self.factory_function(
            litellm.amoderation, call_type="moderation"
        )
        self.aanthropic_messages = self.factory_function(
            litellm.anthropic_messages, call_type="anthropic_messages"
        )
    def discard(self):
        """
@ -653,6 +647,18 @@ class Router:
        self.aget_messages = self.factory_function(litellm.aget_messages)
        self.arun_thread = self.factory_function(litellm.arun_thread)
    def initialize_router_endpoints(self):
        self.amoderation = self.factory_function(
            litellm.amoderation, call_type="moderation"
        )
        self.aanthropic_messages = self.factory_function(
            litellm.anthropic_messages, call_type="anthropic_messages"
        )
        self.aresponses = self.factory_function(
            litellm.aresponses, call_type="aresponses"
        )
        self.responses = self.factory_function(litellm.responses, call_type="responses")
    def routing_strategy_init(
        self, routing_strategy: Union[RoutingStrategy, str], routing_strategy_args: dict
    ):
@ -955,6 +961,7 @@ class Router:
                specific_deployment=kwargs.pop("specific_deployment", None),
                request_kwargs=kwargs,
            )
            _timeout_debug_deployment_dict = deployment
            end_time = time.time()
            _duration = end_time - start_time
@ -1079,17 +1086,22 @@ class Router:
        kwargs.setdefault("litellm_trace_id", str(uuid.uuid4()))
        kwargs.setdefault("metadata", {}).update({"model_group": model})
-    def _update_kwargs_with_default_litellm_params(self, kwargs: dict) -> None:
+    def _update_kwargs_with_default_litellm_params(
        self, kwargs: dict, metadata_variable_name: Optional[str] = "metadata"
    ) -> None:
        """
        Adds default litellm params to kwargs, if set.
        """
        self.default_litellm_params[metadata_variable_name] = (
            self.default_litellm_params.pop("metadata", {})
        )
        for k, v in self.default_litellm_params.items():
            if (
                k not in kwargs and v is not None
            ):  # prioritize model-specific params > default router params
                kwargs[k] = v
-            elif k == "metadata":
+            elif k == metadata_variable_name:
-                kwargs[k].update(v)
+                kwargs[metadata_variable_name].update(v)
    def _handle_clientside_credential(
        self, deployment: dict, kwargs: dict
@ -1120,7 +1132,12 @@ class Router:
        )  # add new deployment to router
        return deployment_pydantic_obj
-    def _update_kwargs_with_deployment(self, deployment: dict, kwargs: dict) -> None:
+    def _update_kwargs_with_deployment(
        self,
        deployment: dict,
        kwargs: dict,
        function_name: Optional[str] = None,
    ) -> None:
        """
        2 jobs:
        - Adds selected deployment, model_info and api_base to kwargs["metadata"] (used for logging)
@ -1137,7 +1154,10 @@ class Router:
            deployment_model_name = deployment_pydantic_obj.litellm_params.model
            deployment_api_base = deployment_pydantic_obj.litellm_params.api_base
-        kwargs.setdefault("metadata", {}).update(
+        metadata_variable_name = _get_router_metadata_variable_name(
            function_name=function_name,
        )
        kwargs.setdefault(metadata_variable_name, {}).update(
            {
                "deployment": deployment_model_name,
                "model_info": model_info,
@ -1150,7 +1170,9 @@ class Router:
            kwargs=kwargs, data=deployment["litellm_params"]
        )
-        self._update_kwargs_with_default_litellm_params(kwargs=kwargs)
+        self._update_kwargs_with_default_litellm_params(
            kwargs=kwargs, metadata_variable_name=metadata_variable_name
        )
    def _get_async_openai_model_client(self, deployment: dict, kwargs: dict):
        """
@ -2395,22 +2417,18 @@ class Router:
                messages=kwargs.get("messages", None),
                specific_deployment=kwargs.pop("specific_deployment", None),
            )
-            self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)
+            self._update_kwargs_with_deployment(
                deployment=deployment, kwargs=kwargs, function_name="generic_api_call"
            )
            data = deployment["litellm_params"].copy()
            model_name = data["model"]
            model_client = self._get_async_openai_model_client(
                deployment=deployment,
                kwargs=kwargs,
            )
            self.total_calls[model_name] += 1
            response = original_function(
                **{
                    **data,
                    "caching": self.cache_responses,
                    "client": model_client,
                    **kwargs,
                }
            )
@ -2452,6 +2470,61 @@ class Router:
                self.fail_calls[model] += 1
            raise e
    def _generic_api_call_with_fallbacks(
        self, model: str, original_function: Callable, **kwargs
    ):
        """
        Make a generic LLM API call through the router, this allows you to use retries/fallbacks with litellm router
        Args:
            model: The model to use
            original_function: The handler function to call (e.g., litellm.completion)
            **kwargs: Additional arguments to pass to the handler function
        Returns:
            The response from the handler function
        """
        handler_name = original_function.__name__
        try:
            verbose_router_logger.debug(
                f"Inside _generic_api_call() - handler: {handler_name}, model: {model}; kwargs: {kwargs}"
            )
            deployment = self.get_available_deployment(
                model=model,
                messages=kwargs.get("messages", None),
                specific_deployment=kwargs.pop("specific_deployment", None),
            )
            self._update_kwargs_with_deployment(
                deployment=deployment, kwargs=kwargs, function_name="generic_api_call"
            )
            data = deployment["litellm_params"].copy()
            model_name = data["model"]
            self.total_calls[model_name] += 1
            # Perform pre-call checks for routing strategy
            self.routing_strategy_pre_call_checks(deployment=deployment)
            response = original_function(
                **{
                    **data,
                    "caching": self.cache_responses,
                    **kwargs,
                }
            )
            self.success_calls[model_name] += 1
            verbose_router_logger.info(
                f"{handler_name}(model={model_name})\033[32m 200 OK\033[0m"
            )
            return response
        except Exception as e:
            verbose_router_logger.info(
                f"{handler_name}(model={model})\033[31m Exception {str(e)}\033[0m"
            )
            if model is not None:
                self.fail_calls[model] += 1
            raise e
    def embedding(
        self,
        model: str,
@ -2973,14 +3046,42 @@ class Router:
        self,
        original_function: Callable,
        call_type: Literal[
-            "assistants", "moderation", "anthropic_messages"
+            "assistants",
            "moderation",
            "anthropic_messages",
            "aresponses",
            "responses",
        ] = "assistants",
    ):
-        async def new_function(
+        """
        Creates appropriate wrapper functions for different API call types.
        Returns:
            - A synchronous function for synchronous call types
            - An asynchronous function for asynchronous call types
        """
        # Handle synchronous call types
        if call_type == "responses":
            def sync_wrapper(
                custom_llm_provider: Optional[
                    Literal["openai", "azure", "anthropic"]
                ] = None,
-            client: Optional["AsyncOpenAI"] = None,
+                client: Optional[Any] = None,
                **kwargs,
            ):
                return self._generic_api_call_with_fallbacks(
                    original_function=original_function, **kwargs
                )
            return sync_wrapper
        # Handle asynchronous call types
        async def async_wrapper(
            custom_llm_provider: Optional[
                Literal["openai", "azure", "anthropic"]
            ] = None,
            client: Optional[Any] = None,
            **kwargs,
        ):
            if call_type == "assistants":
@ -2991,18 +3092,16 @@ class Router:
                    **kwargs,
                )
            elif call_type == "moderation":
-
+                return await self._pass_through_moderation_endpoint_factory(
-                return await self._pass_through_moderation_endpoint_factory(  # type: ignore
+                    original_function=original_function, **kwargs
                    original_function=original_function,
                    **kwargs,
                )
-            elif call_type == "anthropic_messages":
+            elif call_type in ("anthropic_messages", "aresponses"):
-                return await self._ageneric_api_call_with_fallbacks(  # type: ignore
+                return await self._ageneric_api_call_with_fallbacks(
                    original_function=original_function,
                    **kwargs,
                )
-        return new_function
+        return async_wrapper
    async def _pass_through_assistants_endpoint_factory(
        self,
@ -4373,10 +4472,10 @@ class Router:
        if custom_llm_provider not in litellm.provider_list:
            raise Exception(f"Unsupported provider - {custom_llm_provider}")
-        # init OpenAI, Azure clients
+        # # init OpenAI, Azure clients
-        InitalizeOpenAISDKClient.set_client(
+        # InitalizeOpenAISDKClient.set_client(
-            litellm_router_instance=self, model=deployment.to_json(exclude_none=True)
+        #     litellm_router_instance=self, model=deployment.to_json(exclude_none=True)
-        )
+        # )
        self._initialize_deployment_for_pass_through(
            deployment=deployment,
@ -5345,6 +5444,13 @@ class Router:
            client = self.cache.get_cache(
                key=cache_key, local_only=True, parent_otel_span=parent_otel_span
            )
            if client is None:
                InitalizeCachedClient.set_max_parallel_requests_client(
                    litellm_router_instance=self, model=deployment
                )
                client = self.cache.get_cache(
                    key=cache_key, local_only=True, parent_otel_span=parent_otel_span
                )
            return client
        elif client_type == "async":
            if kwargs.get("stream") is True:
@ -5352,36 +5458,12 @@ class Router:
                client = self.cache.get_cache(
                    key=cache_key, local_only=True, parent_otel_span=parent_otel_span
                )
                if client is None:
                    """
                    Re-initialize the client
                    """
                    InitalizeOpenAISDKClient.set_client(
                        litellm_router_instance=self, model=deployment
                    )
                    client = self.cache.get_cache(
                        key=cache_key,
                        local_only=True,
                        parent_otel_span=parent_otel_span,
                    )
                return client
            else:
                cache_key = f"{model_id}_async_client"
                client = self.cache.get_cache(
                    key=cache_key, local_only=True, parent_otel_span=parent_otel_span
                )
                if client is None:
                    """
                    Re-initialize the client
                    """
                    InitalizeOpenAISDKClient.set_client(
                        litellm_router_instance=self, model=deployment
                    )
                    client = self.cache.get_cache(
                        key=cache_key,
                        local_only=True,
                        parent_otel_span=parent_otel_span,
                    )
                return client
        else:
            if kwargs.get("stream") is True:
@ -5389,32 +5471,12 @@ class Router:
                client = self.cache.get_cache(
                    key=cache_key, parent_otel_span=parent_otel_span
                )
                if client is None:
                    """
                    Re-initialize the client
                    """
                    InitalizeOpenAISDKClient.set_client(
                        litellm_router_instance=self, model=deployment
                    )
                    client = self.cache.get_cache(
                        key=cache_key, parent_otel_span=parent_otel_span
                    )
                return client
            else:
                cache_key = f"{model_id}_client"
                client = self.cache.get_cache(
                    key=cache_key, parent_otel_span=parent_otel_span
                )
                if client is None:
                    """
                    Re-initialize the client
                    """
                    InitalizeOpenAISDKClient.set_client(
                        litellm_router_instance=self, model=deployment
                    )
                    client = self.cache.get_cache(
                        key=cache_key, parent_otel_span=parent_otel_span
                    )
                return client
    def _pre_call_checks(  # noqa: PLR0915
--- a/litellm/router_utils/batch_utils.py
+++ b/litellm/router_utils/batch_utils.py
@ -56,7 +56,8 @@ def _get_router_metadata_variable_name(function_name) -> str:
    For ALL other endpoints we call this "metadata
    """
-    if "batch" in function_name:
+    ROUTER_METHODS_USING_LITELLM_METADATA = set(["batch", "generic_api_call"])
    if function_name in ROUTER_METHODS_USING_LITELLM_METADATA:
        return "litellm_metadata"
    else:
        return "metadata"
--- a/litellm/router_utils/client_initalization_utils.py
+++ b/litellm/router_utils/client_initalization_utils.py
@ -1,21 +1,6 @@
 import asyncio
-import os
+from typing import TYPE_CHECKING, Any
 from typing import TYPE_CHECKING, Any, Callable, Optional
 import httpx
 import openai
 import litellm
 from litellm import get_secret, get_secret_str
 from litellm._logging import verbose_router_logger
 from litellm.llms.azure.azure import get_azure_ad_token_from_oidc
 from litellm.llms.azure.common_utils import (
    get_azure_ad_token_from_entrata_id,
    get_azure_ad_token_from_username_password,
 )
 from litellm.secret_managers.get_azure_ad_token_provider import (
    get_azure_ad_token_provider,
 )
 from litellm.utils import calculate_max_parallel_requests
 if TYPE_CHECKING:
@ -26,46 +11,13 @@ else:
    LitellmRouter = Any
-class InitalizeOpenAISDKClient:
+class InitalizeCachedClient:
    @staticmethod
-    def should_initialize_sync_client(
+    def set_max_parallel_requests_client(
        litellm_router_instance: LitellmRouter,
    ) -> bool:
        """
        Returns if Sync OpenAI, Azure Clients should be initialized.
        Do not init sync clients when router.router_general_settings.async_only_mode is True
        """
        if litellm_router_instance is None:
            return False
        if litellm_router_instance.router_general_settings is not None:
            if (
                hasattr(litellm_router_instance, "router_general_settings")
                and hasattr(
                    litellm_router_instance.router_general_settings, "async_only_mode"
                )
                and litellm_router_instance.router_general_settings.async_only_mode
                is True
            ):
                return False
        return True
    @staticmethod
    def set_client(  # noqa: PLR0915
        litellm_router_instance: LitellmRouter, model: dict
    ):
        """
        - Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
        - Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
        """
        client_ttl = litellm_router_instance.client_ttl
        litellm_params = model.get("litellm_params", {})
        model_name = litellm_params.get("model")
        model_id = model["model_info"]["id"]
        # ### IF RPM SET - initialize a semaphore ###
        rpm = litellm_params.get("rpm", None)
        tpm = litellm_params.get("tpm", None)
        max_parallel_requests = litellm_params.get("max_parallel_requests", None)
@ -83,480 +35,3 @@ class InitalizeOpenAISDKClient:
                value=semaphore,
                local_only=True,
            )
        ####  for OpenAI / Azure we need to initalize the Client for High Traffic ########
        custom_llm_provider = litellm_params.get("custom_llm_provider")
        custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
        default_api_base = None
        default_api_key = None
        if custom_llm_provider in litellm.openai_compatible_providers:
            _, custom_llm_provider, api_key, api_base = litellm.get_llm_provider(
                model=model_name
            )
            default_api_base = api_base
            default_api_key = api_key
        if (
            model_name in litellm.open_ai_chat_completion_models
            or custom_llm_provider in litellm.openai_compatible_providers
            or custom_llm_provider == "azure"
            or custom_llm_provider == "azure_text"
            or custom_llm_provider == "custom_openai"
            or custom_llm_provider == "openai"
            or custom_llm_provider == "text-completion-openai"
            or "ft:gpt-3.5-turbo" in model_name
            or model_name in litellm.open_ai_embedding_models
        ):
            is_azure_ai_studio_model: bool = False
            if custom_llm_provider == "azure":
                if litellm.utils._is_non_openai_azure_model(model_name):
                    is_azure_ai_studio_model = True
                    custom_llm_provider = "openai"
                    # remove azure prefx from model_name
                    model_name = model_name.replace("azure/", "")
            # glorified / complicated reading of configs
            # user can pass vars directly or they can pas os.environ/AZURE_API_KEY, in which case we will read the env
            # we do this here because we init clients for Azure, OpenAI and we need to set the right key
            api_key = litellm_params.get("api_key") or default_api_key
            if (
                api_key
                and isinstance(api_key, str)
                and api_key.startswith("os.environ/")
            ):
                api_key_env_name = api_key.replace("os.environ/", "")
                api_key = get_secret_str(api_key_env_name)
                litellm_params["api_key"] = api_key
            api_base = litellm_params.get("api_base")
            base_url: Optional[str] = litellm_params.get("base_url")
            api_base = (
                api_base or base_url or default_api_base
            )  # allow users to pass in `api_base` or `base_url` for azure
            if api_base and api_base.startswith("os.environ/"):
                api_base_env_name = api_base.replace("os.environ/", "")
                api_base = get_secret_str(api_base_env_name)
                litellm_params["api_base"] = api_base
            ## AZURE AI STUDIO MISTRAL CHECK ##
            """
            Make sure api base ends in /v1/
            if not, add it - https://github.com/BerriAI/litellm/issues/2279
            """
            if (
                is_azure_ai_studio_model is True
                and api_base is not None
                and isinstance(api_base, str)
                and not api_base.endswith("/v1/")
            ):
                # check if it ends with a trailing slash
                if api_base.endswith("/"):
                    api_base += "v1/"
                elif api_base.endswith("/v1"):
                    api_base += "/"
                else:
                    api_base += "/v1/"
            api_version = litellm_params.get("api_version")
            if api_version and api_version.startswith("os.environ/"):
                api_version_env_name = api_version.replace("os.environ/", "")
                api_version = get_secret_str(api_version_env_name)
                litellm_params["api_version"] = api_version
            timeout: Optional[float] = (
                litellm_params.pop("timeout", None) or litellm.request_timeout
            )
            if isinstance(timeout, str) and timeout.startswith("os.environ/"):
                timeout_env_name = timeout.replace("os.environ/", "")
                timeout = get_secret(timeout_env_name)  # type: ignore
                litellm_params["timeout"] = timeout
            stream_timeout: Optional[float] = litellm_params.pop(
                "stream_timeout", timeout
            )  # if no stream_timeout is set, default to timeout
            if isinstance(stream_timeout, str) and stream_timeout.startswith(
                "os.environ/"
            ):
                stream_timeout_env_name = stream_timeout.replace("os.environ/", "")
                stream_timeout = get_secret(stream_timeout_env_name)  # type: ignore
                litellm_params["stream_timeout"] = stream_timeout
            max_retries: Optional[int] = litellm_params.pop(
                "max_retries", 0
            )  # router handles retry logic
            if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
                max_retries_env_name = max_retries.replace("os.environ/", "")
                max_retries = get_secret(max_retries_env_name)  # type: ignore
                litellm_params["max_retries"] = max_retries
            organization = litellm_params.get("organization", None)
            if isinstance(organization, str) and organization.startswith("os.environ/"):
                organization_env_name = organization.replace("os.environ/", "")
                organization = get_secret_str(organization_env_name)
                litellm_params["organization"] = organization
            azure_ad_token_provider: Optional[Callable[[], str]] = None
            # If we have api_key, then we have higher priority
            if not api_key and litellm_params.get("tenant_id"):
                verbose_router_logger.debug(
                    "Using Azure AD Token Provider for Azure Auth"
                )
                azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
                    tenant_id=litellm_params.get("tenant_id"),
                    client_id=litellm_params.get("client_id"),
                    client_secret=litellm_params.get("client_secret"),
                )
            if litellm_params.get("azure_username") and litellm_params.get(
                "azure_password"
            ):
                azure_ad_token_provider = get_azure_ad_token_from_username_password(
                    azure_username=litellm_params.get("azure_username"),
                    azure_password=litellm_params.get("azure_password"),
                    client_id=litellm_params.get("client_id"),
                )
            if custom_llm_provider == "azure" or custom_llm_provider == "azure_text":
                if api_base is None or not isinstance(api_base, str):
                    filtered_litellm_params = {
                        k: v
                        for k, v in model["litellm_params"].items()
                        if k != "api_key"
                    }
                    _filtered_model = {
                        "model_name": model["model_name"],
                        "litellm_params": filtered_litellm_params,
                    }
                    raise ValueError(
                        f"api_base is required for Azure OpenAI. Set it on your config. Model - {_filtered_model}"
                    )
                azure_ad_token = litellm_params.get("azure_ad_token")
                if azure_ad_token is not None:
                    if azure_ad_token.startswith("oidc/"):
                        azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
                elif (
                     not api_key and azure_ad_token_provider is None
                    and litellm.enable_azure_ad_token_refresh is True
                ):
                    try:
                        azure_ad_token_provider = get_azure_ad_token_provider()
                    except ValueError:
                        verbose_router_logger.debug(
                            "Azure AD Token Provider could not be used."
                        )
                if api_version is None:
                    api_version = os.getenv(
                        "AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
                    )
                if "gateway.ai.cloudflare.com" in api_base:
                    if not api_base.endswith("/"):
                        api_base += "/"
                    azure_model = model_name.replace("azure/", "")
                    api_base += f"{azure_model}"
                    cache_key = f"{model_id}_async_client"
                    _client = openai.AsyncAzureOpenAI(
                        api_key=api_key,
                        azure_ad_token=azure_ad_token,
                        azure_ad_token_provider=azure_ad_token_provider,
                        base_url=api_base,
                        api_version=api_version,
                        timeout=timeout,  # type: ignore
                        max_retries=max_retries,  # type: ignore
                        http_client=httpx.AsyncClient(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
                            verify=litellm.ssl_verify,
                        ),  # type: ignore
                    )
                    litellm_router_instance.cache.set_cache(
                        key=cache_key,
                        value=_client,
                        ttl=client_ttl,
                        local_only=True,
                    )  # cache for 1 hr
                    if InitalizeOpenAISDKClient.should_initialize_sync_client(
                        litellm_router_instance=litellm_router_instance
                    ):
                        cache_key = f"{model_id}_client"
                        _client = openai.AzureOpenAI(  # type: ignore
                            api_key=api_key,
                            azure_ad_token=azure_ad_token,
                            azure_ad_token_provider=azure_ad_token_provider,
                            base_url=api_base,
                            api_version=api_version,
                            timeout=timeout,  # type: ignore
                            max_retries=max_retries,  # type: ignore
                            http_client=httpx.Client(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
                                verify=litellm.ssl_verify,
                            ),  # type: ignore
                        )
                        litellm_router_instance.cache.set_cache(
                            key=cache_key,
                            value=_client,
                            ttl=client_ttl,
                            local_only=True,
                        )  # cache for 1 hr
                    # streaming clients can have diff timeouts
                    cache_key = f"{model_id}_stream_async_client"
                    _client = openai.AsyncAzureOpenAI(  # type: ignore
                        api_key=api_key,
                        azure_ad_token=azure_ad_token,
                        azure_ad_token_provider=azure_ad_token_provider,
                        base_url=api_base,
                        api_version=api_version,
                        timeout=stream_timeout,  # type: ignore
                        max_retries=max_retries,  # type: ignore
                        http_client=httpx.AsyncClient(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
                            verify=litellm.ssl_verify,
                        ),  # type: ignore
                    )
                    litellm_router_instance.cache.set_cache(
                        key=cache_key,
                        value=_client,
                        ttl=client_ttl,
                        local_only=True,
                    )  # cache for 1 hr
                    if InitalizeOpenAISDKClient.should_initialize_sync_client(
                        litellm_router_instance=litellm_router_instance
                    ):
                        cache_key = f"{model_id}_stream_client"
                        _client = openai.AzureOpenAI(  # type: ignore
                            api_key=api_key,
                            azure_ad_token=azure_ad_token,
                            azure_ad_token_provider=azure_ad_token_provider,
                            base_url=api_base,
                            api_version=api_version,
                            timeout=stream_timeout,  # type: ignore
                            max_retries=max_retries,  # type: ignore
                            http_client=httpx.Client(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
                                verify=litellm.ssl_verify,
                            ),  # type: ignore
                        )
                        litellm_router_instance.cache.set_cache(
                            key=cache_key,
                            value=_client,
                            ttl=client_ttl,
                            local_only=True,
                        )  # cache for 1 hr
                else:
                    _api_key = api_key
                    if _api_key is not None and isinstance(_api_key, str):
                        # only show first 5 chars of api_key
                        _api_key = _api_key[:8] + "*" * 15
                    verbose_router_logger.debug(
                        f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
                    )
                    azure_client_params = {
                        "api_key": api_key,
                        "azure_endpoint": api_base,
                        "api_version": api_version,
                        "azure_ad_token": azure_ad_token,
                        "azure_ad_token_provider": azure_ad_token_provider,
                    }
                    if azure_ad_token_provider is not None:
                        azure_client_params["azure_ad_token_provider"] = (
                            azure_ad_token_provider
                        )
                    from litellm.llms.azure.azure import (
                        select_azure_base_url_or_endpoint,
                    )
                    # this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
                    # required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
                    azure_client_params = select_azure_base_url_or_endpoint(
                        azure_client_params
                    )
                    cache_key = f"{model_id}_async_client"
                    _client = openai.AsyncAzureOpenAI(  # type: ignore
                        **azure_client_params,
                        timeout=timeout,  # type: ignore
                        max_retries=max_retries,  # type: ignore
                        http_client=httpx.AsyncClient(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
                            verify=litellm.ssl_verify,
                        ),  # type: ignore
                    )
                    litellm_router_instance.cache.set_cache(
                        key=cache_key,
                        value=_client,
                        ttl=client_ttl,
                        local_only=True,
                    )  # cache for 1 hr
                    if InitalizeOpenAISDKClient.should_initialize_sync_client(
                        litellm_router_instance=litellm_router_instance
                    ):
                        cache_key = f"{model_id}_client"
                        _client = openai.AzureOpenAI(  # type: ignore
                            **azure_client_params,
                            timeout=timeout,  # type: ignore
                            max_retries=max_retries,  # type: ignore
                            http_client=httpx.Client(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
                                verify=litellm.ssl_verify,
                            ),  # type: ignore
                        )
                        litellm_router_instance.cache.set_cache(
                            key=cache_key,
                            value=_client,
                            ttl=client_ttl,
                            local_only=True,
                        )  # cache for 1 hr
                    # streaming clients should have diff timeouts
                    cache_key = f"{model_id}_stream_async_client"
                    _client = openai.AsyncAzureOpenAI(  # type: ignore
                        **azure_client_params,
                        timeout=stream_timeout,  # type: ignore
                        max_retries=max_retries,  # type: ignore
                        http_client=httpx.AsyncClient(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
                            verify=litellm.ssl_verify,
                        ),
                    )
                    litellm_router_instance.cache.set_cache(
                        key=cache_key,
                        value=_client,
                        ttl=client_ttl,
                        local_only=True,
                    )  # cache for 1 hr
                    if InitalizeOpenAISDKClient.should_initialize_sync_client(
                        litellm_router_instance=litellm_router_instance
                    ):
                        cache_key = f"{model_id}_stream_client"
                        _client = openai.AzureOpenAI(  # type: ignore
                            **azure_client_params,
                            timeout=stream_timeout,  # type: ignore
                            max_retries=max_retries,  # type: ignore
                            http_client=httpx.Client(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
                                verify=litellm.ssl_verify,
                            ),
                        )
                        litellm_router_instance.cache.set_cache(
                            key=cache_key,
                            value=_client,
                            ttl=client_ttl,
                            local_only=True,
                        )  # cache for 1 hr
            else:
                _api_key = api_key  # type: ignore
                if _api_key is not None and isinstance(_api_key, str):
                    # only show first 5 chars of api_key
                    _api_key = _api_key[:8] + "*" * 15
                verbose_router_logger.debug(
                    f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
                )
                cache_key = f"{model_id}_async_client"
                _client = openai.AsyncOpenAI(  # type: ignore
                    api_key=api_key,
                    base_url=api_base,
                    timeout=timeout,  # type: ignore
                    max_retries=max_retries,  # type: ignore
                    organization=organization,
                    http_client=httpx.AsyncClient(
                        limits=httpx.Limits(
                            max_connections=1000, max_keepalive_connections=100
                        ),
                        verify=litellm.ssl_verify,
                    ),  # type: ignore
                )
                litellm_router_instance.cache.set_cache(
                    key=cache_key,
                    value=_client,
                    ttl=client_ttl,
                    local_only=True,
                )  # cache for 1 hr
                if InitalizeOpenAISDKClient.should_initialize_sync_client(
                    litellm_router_instance=litellm_router_instance
                ):
                    cache_key = f"{model_id}_client"
                    _client = openai.OpenAI(  # type: ignore
                        api_key=api_key,
                        base_url=api_base,
                        timeout=timeout,  # type: ignore
                        max_retries=max_retries,  # type: ignore
                        organization=organization,
                        http_client=httpx.Client(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
                            verify=litellm.ssl_verify,
                        ),  # type: ignore
                    )
                    litellm_router_instance.cache.set_cache(
                        key=cache_key,
                        value=_client,
                        ttl=client_ttl,
                        local_only=True,
                    )  # cache for 1 hr
                # streaming clients should have diff timeouts
                cache_key = f"{model_id}_stream_async_client"
                _client = openai.AsyncOpenAI(  # type: ignore
                    api_key=api_key,
                    base_url=api_base,
                    timeout=stream_timeout,  # type: ignore
                    max_retries=max_retries,  # type: ignore
                    organization=organization,
                    http_client=httpx.AsyncClient(
                        limits=httpx.Limits(
                            max_connections=1000, max_keepalive_connections=100
                        ),
                        verify=litellm.ssl_verify,
                    ),  # type: ignore
                )
                litellm_router_instance.cache.set_cache(
                    key=cache_key,
                    value=_client,
                    ttl=client_ttl,
                    local_only=True,
                )  # cache for 1 hr
                if InitalizeOpenAISDKClient.should_initialize_sync_client(
                    litellm_router_instance=litellm_router_instance
                ):
                    # streaming clients should have diff timeouts
                    cache_key = f"{model_id}_stream_client"
                    _client = openai.OpenAI(  # type: ignore
                        api_key=api_key,
                        base_url=api_base,
                        timeout=stream_timeout,  # type: ignore
                        max_retries=max_retries,  # type: ignore
                        organization=organization,
                        http_client=httpx.Client(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
                            verify=litellm.ssl_verify,
                        ),  # type: ignore
                    )
                    litellm_router_instance.cache.set_cache(
                        key=cache_key,
                        value=_client,
                        ttl=client_ttl,
                        local_only=True,
                    )  # cache for 1 hr
--- a/litellm/types/llms/openai.py
+++ b/litellm/types/llms/openai.py
@ -1,6 +1,8 @@
 from enum import Enum
 from os import PathLike
 from typing import IO, Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union
 import httpx
 from openai._legacy_response import (
    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
 )
@ -31,8 +33,24 @@ from openai.types.chat.chat_completion_prediction_content_param import (
 )
 from openai.types.embedding import Embedding as OpenAIEmbedding
 from openai.types.fine_tuning.fine_tuning_job import FineTuningJob
-from pydantic import BaseModel, Field
+from openai.types.responses.response import (
-from typing_extensions import Dict, Required, TypedDict, override
+    IncompleteDetails,
    Response,
    ResponseOutputItem,
    ResponseTextConfig,
    Tool,
    ToolChoice,
 )
 from openai.types.responses.response_create_params import (
    Reasoning,
    ResponseIncludable,
    ResponseInputParam,
    ResponseTextConfigParam,
    ToolChoice,
    ToolParam,
 )
 from pydantic import BaseModel, Discriminator, Field, PrivateAttr
 from typing_extensions import Annotated, Dict, Required, TypedDict, override
 FileContent = Union[IO[bytes], bytes, PathLike]
@ -684,3 +702,326 @@ OpenAIAudioTranscriptionOptionalParams = Literal[
 OpenAIImageVariationOptionalParams = Literal["n", "size", "response_format", "user"]
 class ResponsesAPIOptionalRequestParams(TypedDict, total=False):
    """TypedDict for Optional parameters supported by the responses API."""
    include: Optional[List[ResponseIncludable]]
    instructions: Optional[str]
    max_output_tokens: Optional[int]
    metadata: Optional[Dict[str, Any]]
    parallel_tool_calls: Optional[bool]
    previous_response_id: Optional[str]
    reasoning: Optional[Reasoning]
    store: Optional[bool]
    stream: Optional[bool]
    temperature: Optional[float]
    text: Optional[ResponseTextConfigParam]
    tool_choice: Optional[ToolChoice]
    tools: Optional[Iterable[ToolParam]]
    top_p: Optional[float]
    truncation: Optional[Literal["auto", "disabled"]]
    user: Optional[str]
 class ResponsesAPIRequestParams(ResponsesAPIOptionalRequestParams, total=False):
    """TypedDict for request parameters supported by the responses API."""
    input: Union[str, ResponseInputParam]
    model: str
 class BaseLiteLLMOpenAIResponseObject(BaseModel):
    def __getitem__(self, key):
        return self.__dict__[key]
    def get(self, key, default=None):
        return self.__dict__.get(key, default)
    def __contains__(self, key):
        return key in self.__dict__
    def items(self):
        return self.__dict__.items()
 class OutputTokensDetails(BaseLiteLLMOpenAIResponseObject):
    reasoning_tokens: int
    model_config = {"extra": "allow"}
 class ResponseAPIUsage(BaseLiteLLMOpenAIResponseObject):
    input_tokens: int
    """The number of input tokens."""
    output_tokens: int
    """The number of output tokens."""
    output_tokens_details: Optional[OutputTokensDetails]
    """A detailed breakdown of the output tokens."""
    total_tokens: int
    """The total number of tokens used."""
    model_config = {"extra": "allow"}
 class ResponsesAPIResponse(BaseLiteLLMOpenAIResponseObject):
    id: str
    created_at: float
    error: Optional[dict]
    incomplete_details: Optional[IncompleteDetails]
    instructions: Optional[str]
    metadata: Optional[Dict]
    model: Optional[str]
    object: Optional[str]
    output: List[ResponseOutputItem]
    parallel_tool_calls: bool
    temperature: Optional[float]
    tool_choice: ToolChoice
    tools: List[Tool]
    top_p: Optional[float]
    max_output_tokens: Optional[int]
    previous_response_id: Optional[str]
    reasoning: Optional[Reasoning]
    status: Optional[str]
    text: Optional[ResponseTextConfig]
    truncation: Optional[Literal["auto", "disabled"]]
    usage: Optional[ResponseAPIUsage]
    user: Optional[str]
    # Define private attributes using PrivateAttr
    _hidden_params: dict = PrivateAttr(default_factory=dict)
 class ResponsesAPIStreamEvents(str, Enum):
    """
    Enum representing all supported OpenAI stream event types for the Responses API.
    Inherits from str to allow direct string comparison and usage as dictionary keys.
    """
    # Response lifecycle events
    RESPONSE_CREATED = "response.created"
    RESPONSE_IN_PROGRESS = "response.in_progress"
    RESPONSE_COMPLETED = "response.completed"
    RESPONSE_FAILED = "response.failed"
    RESPONSE_INCOMPLETE = "response.incomplete"
    # Output item events
    OUTPUT_ITEM_ADDED = "response.output_item.added"
    OUTPUT_ITEM_DONE = "response.output_item.done"
    # Content part events
    CONTENT_PART_ADDED = "response.content_part.added"
    CONTENT_PART_DONE = "response.content_part.done"
    # Output text events
    OUTPUT_TEXT_DELTA = "response.output_text.delta"
    OUTPUT_TEXT_ANNOTATION_ADDED = "response.output_text.annotation.added"
    OUTPUT_TEXT_DONE = "response.output_text.done"
    # Refusal events
    REFUSAL_DELTA = "response.refusal.delta"
    REFUSAL_DONE = "response.refusal.done"
    # Function call events
    FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta"
    FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done"
    # File search events
    FILE_SEARCH_CALL_IN_PROGRESS = "response.file_search_call.in_progress"
    FILE_SEARCH_CALL_SEARCHING = "response.file_search_call.searching"
    FILE_SEARCH_CALL_COMPLETED = "response.file_search_call.completed"
    # Web search events
    WEB_SEARCH_CALL_IN_PROGRESS = "response.web_search_call.in_progress"
    WEB_SEARCH_CALL_SEARCHING = "response.web_search_call.searching"
    WEB_SEARCH_CALL_COMPLETED = "response.web_search_call.completed"
    # Error event
    ERROR = "error"
 class ResponseCreatedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.RESPONSE_CREATED]
    response: ResponsesAPIResponse
 class ResponseInProgressEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.RESPONSE_IN_PROGRESS]
    response: ResponsesAPIResponse
 class ResponseCompletedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.RESPONSE_COMPLETED]
    response: ResponsesAPIResponse
    _hidden_params: dict = PrivateAttr(default_factory=dict)
 class ResponseFailedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.RESPONSE_FAILED]
    response: ResponsesAPIResponse
 class ResponseIncompleteEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.RESPONSE_INCOMPLETE]
    response: ResponsesAPIResponse
 class OutputItemAddedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.OUTPUT_ITEM_ADDED]
    output_index: int
    item: dict
 class OutputItemDoneEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.OUTPUT_ITEM_DONE]
    output_index: int
    item: dict
 class ContentPartAddedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.CONTENT_PART_ADDED]
    item_id: str
    output_index: int
    content_index: int
    part: dict
 class ContentPartDoneEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.CONTENT_PART_DONE]
    item_id: str
    output_index: int
    content_index: int
    part: dict
 class OutputTextDeltaEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA]
    item_id: str
    output_index: int
    content_index: int
    delta: str
 class OutputTextAnnotationAddedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.OUTPUT_TEXT_ANNOTATION_ADDED]
    item_id: str
    output_index: int
    content_index: int
    annotation_index: int
    annotation: dict
 class OutputTextDoneEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.OUTPUT_TEXT_DONE]
    item_id: str
    output_index: int
    content_index: int
    text: str
 class RefusalDeltaEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.REFUSAL_DELTA]
    item_id: str
    output_index: int
    content_index: int
    delta: str
 class RefusalDoneEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.REFUSAL_DONE]
    item_id: str
    output_index: int
    content_index: int
    refusal: str
 class FunctionCallArgumentsDeltaEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DELTA]
    item_id: str
    output_index: int
    delta: str
 class FunctionCallArgumentsDoneEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DONE]
    item_id: str
    output_index: int
    arguments: str
 class FileSearchCallInProgressEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.FILE_SEARCH_CALL_IN_PROGRESS]
    output_index: int
    item_id: str
 class FileSearchCallSearchingEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.FILE_SEARCH_CALL_SEARCHING]
    output_index: int
    item_id: str
 class FileSearchCallCompletedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.FILE_SEARCH_CALL_COMPLETED]
    output_index: int
    item_id: str
 class WebSearchCallInProgressEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.WEB_SEARCH_CALL_IN_PROGRESS]
    output_index: int
    item_id: str
 class WebSearchCallSearchingEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.WEB_SEARCH_CALL_SEARCHING]
    output_index: int
    item_id: str
 class WebSearchCallCompletedEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.WEB_SEARCH_CALL_COMPLETED]
    output_index: int
    item_id: str
 class ErrorEvent(BaseLiteLLMOpenAIResponseObject):
    type: Literal[ResponsesAPIStreamEvents.ERROR]
    code: Optional[str]
    message: str
    param: Optional[str]
 # Union type for all possible streaming responses
 ResponsesAPIStreamingResponse = Annotated[
    Union[
        ResponseCreatedEvent,
        ResponseInProgressEvent,
        ResponseCompletedEvent,
        ResponseFailedEvent,
        ResponseIncompleteEvent,
        OutputItemAddedEvent,
        OutputItemDoneEvent,
        ContentPartAddedEvent,
        ContentPartDoneEvent,
        OutputTextDeltaEvent,
        OutputTextAnnotationAddedEvent,
        OutputTextDoneEvent,
        RefusalDeltaEvent,
        RefusalDoneEvent,
        FunctionCallArgumentsDeltaEvent,
        FunctionCallArgumentsDoneEvent,
        FileSearchCallInProgressEvent,
        FileSearchCallSearchingEvent,
        FileSearchCallCompletedEvent,
        WebSearchCallInProgressEvent,
        WebSearchCallSearchingEvent,
        WebSearchCallCompletedEvent,
        ErrorEvent,
    ],
    Discriminator("type"),
 ]
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@ -191,6 +191,44 @@ class CallTypes(Enum):
    retrieve_batch = "retrieve_batch"
    pass_through = "pass_through_endpoint"
    anthropic_messages = "anthropic_messages"
    get_assistants = "get_assistants"
    aget_assistants = "aget_assistants"
    create_assistants = "create_assistants"
    acreate_assistants = "acreate_assistants"
    delete_assistant = "delete_assistant"
    adelete_assistant = "adelete_assistant"
    acreate_thread = "acreate_thread"
    create_thread = "create_thread"
    aget_thread = "aget_thread"
    get_thread = "get_thread"
    a_add_message = "a_add_message"
    add_message = "add_message"
    aget_messages = "aget_messages"
    get_messages = "get_messages"
    arun_thread = "arun_thread"
    run_thread = "run_thread"
    arun_thread_stream = "arun_thread_stream"
    run_thread_stream = "run_thread_stream"
    afile_retrieve = "afile_retrieve"
    file_retrieve = "file_retrieve"
    afile_delete = "afile_delete"
    file_delete = "file_delete"
    afile_list = "afile_list"
    file_list = "file_list"
    acreate_file = "acreate_file"
    create_file = "create_file"
    afile_content = "afile_content"
    file_content = "file_content"
    create_fine_tuning_job = "create_fine_tuning_job"
    acreate_fine_tuning_job = "acreate_fine_tuning_job"
    acancel_fine_tuning_job = "acancel_fine_tuning_job"
    cancel_fine_tuning_job = "cancel_fine_tuning_job"
    alist_fine_tuning_jobs = "alist_fine_tuning_jobs"
    list_fine_tuning_jobs = "list_fine_tuning_jobs"
    aretrieve_fine_tuning_job = "aretrieve_fine_tuning_job"
    retrieve_fine_tuning_job = "retrieve_fine_tuning_job"
    responses = "responses"
    aresponses = "aresponses"
 CallTypesLiteral = Literal[
@ -1815,6 +1853,7 @@ all_litellm_params = [
    "budget_duration",
    "use_in_pass_through",
    "merge_reasoning_content_in_choices",
    "litellm_credential_name",
 ] + list(StandardCallbackDynamicParams.__annotations__.keys())
@ -2011,3 +2050,9 @@ class RawRequestTypedDict(TypedDict, total=False):
    raw_request_body: Optional[dict]
    raw_request_headers: Optional[dict]
    error: Optional[str]
 class CredentialItem(BaseModel):
    credential_name: str
    credential_values: dict
    credential_info: dict
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -66,6 +66,7 @@ from litellm.litellm_core_utils.core_helpers import (
    map_finish_reason,
    process_response_headers,
 )
 from litellm.litellm_core_utils.credential_accessor import CredentialAccessor
 from litellm.litellm_core_utils.default_encoding import encoding
 from litellm.litellm_core_utils.exception_mapping_utils import (
    _get_response_headers,
@ -141,6 +142,7 @@ from litellm.types.utils import (
    ChatCompletionMessageToolCall,
    Choices,
    CostPerToken,
    CredentialItem,
    CustomHuggingfaceTokenizer,
    Delta,
    Embedding,
@ -209,6 +211,7 @@ from litellm.llms.base_llm.image_variations.transformation import (
    BaseImageVariationConfig,
 )
 from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from ._logging import _is_debugging_on, verbose_logger
 from .caching.caching import (
@ -455,6 +458,18 @@ def get_applied_guardrails(kwargs: Dict[str, Any]) -> List[str]:
    return applied_guardrails
 def load_credentials_from_list(kwargs: dict):
    """
    Updates kwargs with the credentials if credential_name in kwarg
    """
    credential_name = kwargs.get("litellm_credential_name")
    if credential_name and litellm.credential_list:
        credential_accessor = CredentialAccessor.get_credential_values(credential_name)
        for key, value in credential_accessor.items():
            if key not in kwargs:
                kwargs[key] = value
 def get_dynamic_callbacks(
    dynamic_callbacks: Optional[List[Union[str, Callable, CustomLogger]]]
 ) -> List:
@ -715,6 +730,11 @@ def function_setup(  # noqa: PLR0915
            call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
        ):
            messages = kwargs.get("input", "speech")
        elif (
            call_type == CallTypes.aresponses.value
            or call_type == CallTypes.responses.value
        ):
            messages = args[0] if len(args) > 0 else kwargs["input"]
        else:
            messages = "default-message-value"
        stream = True if "stream" in kwargs and kwargs["stream"] is True else False
@ -983,6 +1003,8 @@ def client(original_function):  # noqa: PLR0915
                logging_obj, kwargs = function_setup(
                    original_function.__name__, rules_obj, start_time, *args, **kwargs
                )
            ## LOAD CREDENTIALS
            load_credentials_from_list(kwargs)
            kwargs["litellm_logging_obj"] = logging_obj
            _llm_caching_handler: LLMCachingHandler = LLMCachingHandler(
                original_function=original_function,
@ -1239,6 +1261,8 @@ def client(original_function):  # noqa: PLR0915
                    original_function.__name__, rules_obj, start_time, *args, **kwargs
                )
            kwargs["litellm_logging_obj"] = logging_obj
            ## LOAD CREDENTIALS
            load_credentials_from_list(kwargs)
            logging_obj._llm_caching_handler = _llm_caching_handler
            # [OPTIONAL] CHECK BUDGET
            if litellm.max_budget:
@ -5104,7 +5128,7 @@ def prompt_token_calculator(model, messages):
        from anthropic import AI_PROMPT, HUMAN_PROMPT, Anthropic
        anthropic_obj = Anthropic()
-        num_tokens = anthropic_obj.count_tokens(text)
+        num_tokens = anthropic_obj.count_tokens(text)  # type: ignore
    else:
        num_tokens = len(encoding.encode(text))
    return num_tokens
@ -6276,6 +6300,15 @@ class ProviderConfigManager:
            return litellm.DeepgramAudioTranscriptionConfig()
        return None
    @staticmethod
    def get_provider_responses_api_config(
        model: str,
        provider: LlmProviders,
    ) -> Optional[BaseResponsesAPIConfig]:
        if litellm.LlmProviders.OPENAI == provider:
            return litellm.OpenAIResponsesAPIConfig()
        return None
    @staticmethod
    def get_provider_text_completion_config(
        model: str,
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@ -2294,6 +2294,7 @@
        "output_cost_per_token": 0.0,
        "litellm_provider": "azure_ai",
        "mode": "embedding",
        "supports_embedding_image_input": true,
        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
    },
    "azure_ai/Cohere-embed-v3-multilingual": {
@ -2304,6 +2305,7 @@
        "output_cost_per_token": 0.0,
        "litellm_provider": "azure_ai",
        "mode": "embedding",
        "supports_embedding_image_input": true,
        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
    },
    "babbage-002": {
@ -4508,7 +4510,7 @@
    "gemini/gemini-2.0-flash-thinking-exp": {
        "max_tokens": 8192,
        "max_input_tokens": 1048576,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 65536,
        "max_images_per_prompt": 3000,
        "max_videos_per_prompt": 10,
        "max_video_length": 1,
@ -4541,6 +4543,98 @@
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash",
        "supports_tool_choice": true
    },
    "gemini/gemini-2.0-flash-thinking-exp-01-21": {
        "max_tokens": 8192,
        "max_input_tokens": 1048576,
        "max_output_tokens": 65536,
        "max_images_per_prompt": 3000,
        "max_videos_per_prompt": 10,
        "max_video_length": 1,
        "max_audio_length_hours": 8.4,
        "max_audio_per_prompt": 1,
        "max_pdf_size_mb": 30,
        "input_cost_per_image": 0,
        "input_cost_per_video_per_second": 0,
        "input_cost_per_audio_per_second": 0,
        "input_cost_per_token": 0,
        "input_cost_per_character": 0, 
        "input_cost_per_token_above_128k_tokens": 0, 
        "input_cost_per_character_above_128k_tokens": 0, 
        "input_cost_per_image_above_128k_tokens": 0,
        "input_cost_per_video_per_second_above_128k_tokens": 0,
        "input_cost_per_audio_per_second_above_128k_tokens": 0,
        "output_cost_per_token": 0,
        "output_cost_per_character": 0,
        "output_cost_per_token_above_128k_tokens": 0,
        "output_cost_per_character_above_128k_tokens": 0,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_system_messages": true,
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
        "supports_audio_output": true,
        "tpm": 4000000,
        "rpm": 10,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash",
        "supports_tool_choice": true
    },
    "gemini/gemma-3-27b-it": {
        "max_tokens": 8192,
        "max_input_tokens": 131072,
        "max_output_tokens": 8192,
        "input_cost_per_image": 0,
        "input_cost_per_video_per_second": 0,
        "input_cost_per_audio_per_second": 0,
        "input_cost_per_token": 0,
        "input_cost_per_character": 0, 
        "input_cost_per_token_above_128k_tokens": 0, 
        "input_cost_per_character_above_128k_tokens": 0, 
        "input_cost_per_image_above_128k_tokens": 0,
        "input_cost_per_video_per_second_above_128k_tokens": 0,
        "input_cost_per_audio_per_second_above_128k_tokens": 0,
        "output_cost_per_token": 0,
        "output_cost_per_character": 0,
        "output_cost_per_token_above_128k_tokens": 0,
        "output_cost_per_character_above_128k_tokens": 0,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_system_messages": true,
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
        "supports_audio_output": false,
        "source": "https://aistudio.google.com",
        "supports_tool_choice": true
    },
    "gemini/learnIm-1.5-pro-experimental": {
        "max_tokens": 8192,
        "max_input_tokens": 32767,
        "max_output_tokens": 8192,
        "input_cost_per_image": 0,
        "input_cost_per_video_per_second": 0,
        "input_cost_per_audio_per_second": 0,
        "input_cost_per_token": 0,
        "input_cost_per_character": 0, 
        "input_cost_per_token_above_128k_tokens": 0, 
        "input_cost_per_character_above_128k_tokens": 0, 
        "input_cost_per_image_above_128k_tokens": 0,
        "input_cost_per_video_per_second_above_128k_tokens": 0,
        "input_cost_per_audio_per_second_above_128k_tokens": 0,
        "output_cost_per_token": 0,
        "output_cost_per_character": 0,
        "output_cost_per_token_above_128k_tokens": 0,
        "output_cost_per_character_above_128k_tokens": 0,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_system_messages": true,
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
        "supports_audio_output": false,
        "source": "https://aistudio.google.com",
        "supports_tool_choice": true
    },
    "vertex_ai/claude-3-sonnet": {
        "max_tokens": 4096,
        "max_input_tokens": 200000,
@ -5708,6 +5802,7 @@
        "input_cost_per_token": 0.00000010,
        "output_cost_per_token": 0.00000,
        "litellm_provider": "cohere",
        "supports_embedding_image_input": true,
        "mode": "embedding"
    },
    "embed-english-v2.0": {
@ -7890,7 +7985,8 @@
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "bedrock",                
-        "mode": "embedding"
+        "mode": "embedding",
        "supports_embedding_image_input": true
    },
    "cohere.embed-multilingual-v3": {
        "max_tokens": 512, 
@ -7898,7 +7994,8 @@
        "input_cost_per_token": 0.0000001,
        "output_cost_per_token": 0.000000,
        "litellm_provider": "bedrock",
-        "mode": "embedding"
+        "mode": "embedding",
        "supports_embedding_image_input": true
    },
    "us.deepseek.r1-v1:0": {
        "max_tokens": 4096, 
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.63.7"
+version = "1.63.8"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@ -21,7 +21,7 @@ Documentation = "https://docs.litellm.ai"
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0, !=3.9.7"
 httpx = ">=0.23.0"
-openai = ">=1.61.0"
+openai = ">=1.66.1"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 importlib-metadata = ">=6.8.0"
@ -96,7 +96,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 [tool.commitizen]
-version = "1.63.7"
+version = "1.63.8"
 version_files = [
    "pyproject.toml:^version"
 ]
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 # LITELLM PROXY DEPENDENCIES #
 anyio==4.4.0 # openai + http req.
 httpx==0.27.0 # Pin Httpx dependency
-openai==1.61.0  # openai req. 
+openai==1.66.1  # openai req. 
 fastapi==0.115.5 # server dep
 backoff==2.2.1 # server dep
 pyyaml==6.0.2 # server dep
--- a/schema.prisma
+++ b/schema.prisma
@ -29,6 +29,18 @@ model LiteLLM_BudgetTable {
  organization_membership LiteLLM_OrganizationMembership[] // budgets of Users within a Organization 
 }
 // Models on proxy
 model LiteLLM_CredentialsTable {
  credential_id String @id @default(uuid())
  credential_name String @unique
  credential_values Json
  credential_info Json? 
  created_at    DateTime               @default(now()) @map("created_at")
  created_by String
  updated_at    DateTime               @default(now()) @updatedAt @map("updated_at")
  updated_by String
 }
 // Models on proxy
 model LiteLLM_ProxyModelTable {
  model_id String @id @default(uuid())
--- a/tests/litellm/llms/azure/test_azure_common_utils.py
+++ b/tests/litellm/llms/azure/test_azure_common_utils.py
@ -0,0 +1,457 @@
 import json
 import os
 import sys
 import traceback
 from typing import Callable, Optional
 from unittest.mock import MagicMock, patch
 import pytest
 sys.path.insert(
    0, os.path.abspath("../../../..")
 )  # Adds the parent directory to the system path
 import litellm
 from litellm.llms.azure.common_utils import BaseAzureLLM
 from litellm.types.utils import CallTypes
 # Mock the necessary dependencies
@pytest.fixture
 def setup_mocks():
    with patch(
        "litellm.llms.azure.common_utils.get_azure_ad_token_from_entrata_id"
    ) as mock_entrata_token, patch(
        "litellm.llms.azure.common_utils.get_azure_ad_token_from_username_password"
    ) as mock_username_password_token, patch(
        "litellm.llms.azure.common_utils.get_azure_ad_token_from_oidc"
    ) as mock_oidc_token, patch(
        "litellm.llms.azure.common_utils.get_azure_ad_token_provider"
    ) as mock_token_provider, patch(
        "litellm.llms.azure.common_utils.litellm"
    ) as mock_litellm, patch(
        "litellm.llms.azure.common_utils.verbose_logger"
    ) as mock_logger, patch(
        "litellm.llms.azure.common_utils.select_azure_base_url_or_endpoint"
    ) as mock_select_url:
        # Configure mocks
        mock_litellm.AZURE_DEFAULT_API_VERSION = "2023-05-15"
        mock_litellm.enable_azure_ad_token_refresh = False
        mock_entrata_token.return_value = lambda: "mock-entrata-token"
        mock_username_password_token.return_value = (
            lambda: "mock-username-password-token"
        )
        mock_oidc_token.return_value = "mock-oidc-token"
        mock_token_provider.return_value = lambda: "mock-default-token"
        mock_select_url.side_effect = (
            lambda azure_client_params, **kwargs: azure_client_params
        )
        yield {
            "entrata_token": mock_entrata_token,
            "username_password_token": mock_username_password_token,
            "oidc_token": mock_oidc_token,
            "token_provider": mock_token_provider,
            "litellm": mock_litellm,
            "logger": mock_logger,
            "select_url": mock_select_url,
        }
 def test_initialize_with_api_key(setup_mocks):
    # Test with api_key provided
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={},
        api_key="test-api-key",
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version="2023-06-01",
    )
    # Verify expected result
    assert result["api_key"] == "test-api-key"
    assert result["azure_endpoint"] == "https://test.openai.azure.com"
    assert result["api_version"] == "2023-06-01"
    assert "azure_ad_token" in result
    assert result["azure_ad_token"] is None
 def test_initialize_with_tenant_credentials(setup_mocks):
    # Test with tenant_id, client_id, and client_secret provided
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={
            "tenant_id": "test-tenant-id",
            "client_id": "test-client-id",
            "client_secret": "test-client-secret",
        },
        api_key=None,
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version=None,
    )
    # Verify that get_azure_ad_token_from_entrata_id was called
    setup_mocks["entrata_token"].assert_called_once_with(
        tenant_id="test-tenant-id",
        client_id="test-client-id",
        client_secret="test-client-secret",
    )
    # Verify expected result
    assert result["api_key"] is None
    assert result["azure_endpoint"] == "https://test.openai.azure.com"
    assert "azure_ad_token_provider" in result
 def test_initialize_with_username_password(setup_mocks):
    # Test with azure_username, azure_password, and client_id provided
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={
            "azure_username": "test-username",
            "azure_password": "test-password",
            "client_id": "test-client-id",
        },
        api_key=None,
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version=None,
    )
    # Verify that get_azure_ad_token_from_username_password was called
    setup_mocks["username_password_token"].assert_called_once_with(
        azure_username="test-username",
        azure_password="test-password",
        client_id="test-client-id",
    )
    # Verify expected result
    assert "azure_ad_token_provider" in result
 def test_initialize_with_oidc_token(setup_mocks):
    # Test with azure_ad_token that starts with "oidc/"
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={"azure_ad_token": "oidc/test-token"},
        api_key=None,
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version=None,
    )
    # Verify that get_azure_ad_token_from_oidc was called
    setup_mocks["oidc_token"].assert_called_once_with("oidc/test-token")
    # Verify expected result
    assert result["azure_ad_token"] == "mock-oidc-token"
 def test_initialize_with_enable_token_refresh(setup_mocks):
    # Enable token refresh
    setup_mocks["litellm"].enable_azure_ad_token_refresh = True
    # Test with token refresh enabled
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={},
        api_key=None,
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version=None,
    )
    # Verify that get_azure_ad_token_provider was called
    setup_mocks["token_provider"].assert_called_once()
    # Verify expected result
    assert "azure_ad_token_provider" in result
 def test_initialize_with_token_refresh_error(setup_mocks):
    # Enable token refresh but make it raise an error
    setup_mocks["litellm"].enable_azure_ad_token_refresh = True
    setup_mocks["token_provider"].side_effect = ValueError("Token provider error")
    # Test with token refresh enabled but raising error
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={},
        api_key=None,
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version=None,
    )
    # Verify error was logged
    setup_mocks["logger"].debug.assert_any_call(
        "Azure AD Token Provider could not be used."
    )
 def test_api_version_from_env_var(setup_mocks):
    # Test api_version from environment variable
    with patch.dict(os.environ, {"AZURE_API_VERSION": "2023-07-01"}):
        result = BaseAzureLLM().initialize_azure_sdk_client(
            litellm_params={},
            api_key="test-api-key",
            api_base="https://test.openai.azure.com",
            model_name="gpt-4",
            api_version=None,
        )
    # Verify expected result
    assert result["api_version"] == "2023-07-01"
 def test_select_azure_base_url_called(setup_mocks):
    # Test that select_azure_base_url_or_endpoint is called
    result = BaseAzureLLM().initialize_azure_sdk_client(
        litellm_params={},
        api_key="test-api-key",
        api_base="https://test.openai.azure.com",
        model_name="gpt-4",
        api_version="2023-06-01",
    )
    # Verify that select_azure_base_url_or_endpoint was called
    setup_mocks["select_url"].assert_called_once()
@pytest.mark.parametrize(
    "call_type",
    [
        call_type
        for call_type in CallTypes.__members__.values()
        if call_type.name.startswith("a")
        and call_type.name
        not in [
            "amoderation",
            "arerank",
            "arealtime",
            "anthropic_messages",
            "add_message",
            "arun_thread_stream",
            "aresponses",
        ]
    ],
 )
@pytest.mark.asyncio
 async def test_ensure_initialize_azure_sdk_client_always_used(call_type):
    from litellm.router import Router
    # Create a router with an Azure model
    azure_model_name = "azure/chatgpt-v-2"
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": azure_model_name,
                    "api_key": "test-api-key",
                    "api_version": os.getenv("AZURE_API_VERSION", "2023-05-15"),
                    "api_base": os.getenv(
                        "AZURE_API_BASE", "https://test.openai.azure.com"
                    ),
                },
            }
        ],
    )
    # Prepare test input based on call type
    test_inputs = {
        "acompletion": {
            "messages": [{"role": "user", "content": "Hello, how are you?"}]
        },
        "atext_completion": {"prompt": "Hello, how are you?"},
        "aimage_generation": {"prompt": "Hello, how are you?"},
        "aembedding": {"input": "Hello, how are you?"},
        "arerank": {"input": "Hello, how are you?"},
        "atranscription": {"file": "path/to/file"},
        "aspeech": {"input": "Hello, how are you?", "voice": "female"},
        "acreate_batch": {
            "completion_window": 10,
            "endpoint": "https://test.openai.azure.com",
            "input_file_id": "123",
        },
        "aretrieve_batch": {"batch_id": "123"},
        "aget_assistants": {"custom_llm_provider": "azure"},
        "acreate_assistants": {"custom_llm_provider": "azure"},
        "adelete_assistant": {"custom_llm_provider": "azure", "assistant_id": "123"},
        "acreate_thread": {"custom_llm_provider": "azure"},
        "aget_thread": {"custom_llm_provider": "azure", "thread_id": "123"},
        "a_add_message": {
            "custom_llm_provider": "azure",
            "thread_id": "123",
            "role": "user",
            "content": "Hello, how are you?",
        },
        "aget_messages": {"custom_llm_provider": "azure", "thread_id": "123"},
        "arun_thread": {
            "custom_llm_provider": "azure",
            "assistant_id": "123",
            "thread_id": "123",
        },
        "acreate_file": {
            "custom_llm_provider": "azure",
            "file": MagicMock(),
            "purpose": "assistants",
        },
    }
    # Get appropriate input for this call type
    input_kwarg = test_inputs.get(call_type.value, {})
    patch_target = "litellm.main.azure_chat_completions.initialize_azure_sdk_client"
    if call_type == CallTypes.atranscription:
        patch_target = (
            "litellm.main.azure_audio_transcriptions.initialize_azure_sdk_client"
        )
    elif call_type == CallTypes.arerank:
        patch_target = (
            "litellm.rerank_api.main.azure_rerank.initialize_azure_sdk_client"
        )
    elif call_type == CallTypes.acreate_batch or call_type == CallTypes.aretrieve_batch:
        patch_target = (
            "litellm.batches.main.azure_batches_instance.initialize_azure_sdk_client"
        )
    elif (
        call_type == CallTypes.aget_assistants
        or call_type == CallTypes.acreate_assistants
        or call_type == CallTypes.adelete_assistant
        or call_type == CallTypes.acreate_thread
        or call_type == CallTypes.aget_thread
        or call_type == CallTypes.a_add_message
        or call_type == CallTypes.aget_messages
        or call_type == CallTypes.arun_thread
    ):
        patch_target = (
            "litellm.assistants.main.azure_assistants_api.initialize_azure_sdk_client"
        )
    elif call_type == CallTypes.acreate_file or call_type == CallTypes.afile_content:
        patch_target = (
            "litellm.files.main.azure_files_instance.initialize_azure_sdk_client"
        )
    # Mock the initialize_azure_sdk_client function
    with patch(patch_target) as mock_init_azure:
        # Also mock async_function_with_fallbacks to prevent actual API calls
        # Call the appropriate router method
        try:
            get_attr = getattr(router, call_type.value, None)
            if get_attr is None:
                pytest.skip(
                    f"Skipping {call_type.value} because it is not supported on Router"
                )
            await getattr(router, call_type.value)(
                model="gpt-3.5-turbo",
                **input_kwarg,
                num_retries=0,
                azure_ad_token="oidc/test-token",
            )
        except Exception as e:
            traceback.print_exc()
        # Verify initialize_azure_sdk_client was called
        mock_init_azure.assert_called_once()
        # Verify it was called with the right model name
        calls = mock_init_azure.call_args_list
        azure_calls = [call for call in calls]
        litellm_params = azure_calls[0].kwargs["litellm_params"]
        print("litellm_params", litellm_params)
        assert (
            "azure_ad_token" in litellm_params
        ), "azure_ad_token not found in parameters"
        assert (
            litellm_params["azure_ad_token"] == "oidc/test-token"
        ), "azure_ad_token is not correct"
        # More detailed verification (optional)
        for call in azure_calls:
            assert "api_key" in call.kwargs, "api_key not found in parameters"
            assert "api_base" in call.kwargs, "api_base not found in parameters"
@pytest.mark.parametrize(
    "call_type",
    [
        CallTypes.atext_completion,
        CallTypes.acompletion,
    ],
 )
@pytest.mark.asyncio
 async def test_ensure_initialize_azure_sdk_client_always_used_azure_text(call_type):
    from litellm.router import Router
    # Create a router with an Azure model
    azure_model_name = "azure_text/chatgpt-v-2"
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": azure_model_name,
                    "api_key": "test-api-key",
                    "api_version": os.getenv("AZURE_API_VERSION", "2023-05-15"),
                    "api_base": os.getenv(
                        "AZURE_API_BASE", "https://test.openai.azure.com"
                    ),
                },
            }
        ],
    )
    # Prepare test input based on call type
    test_inputs = {
        "acompletion": {
            "messages": [{"role": "user", "content": "Hello, how are you?"}]
        },
        "atext_completion": {"prompt": "Hello, how are you?"},
    }
    # Get appropriate input for this call type
    input_kwarg = test_inputs.get(call_type.value, {})
    patch_target = "litellm.main.azure_text_completions.initialize_azure_sdk_client"
    # Mock the initialize_azure_sdk_client function
    with patch(patch_target) as mock_init_azure:
        # Also mock async_function_with_fallbacks to prevent actual API calls
        # Call the appropriate router method
        try:
            get_attr = getattr(router, call_type.value, None)
            if get_attr is None:
                pytest.skip(
                    f"Skipping {call_type.value} because it is not supported on Router"
                )
            await getattr(router, call_type.value)(
                model="gpt-3.5-turbo",
                **input_kwarg,
                num_retries=0,
                azure_ad_token="oidc/test-token",
            )
        except Exception as e:
            traceback.print_exc()
        # Verify initialize_azure_sdk_client was called
        mock_init_azure.assert_called_once()
        # Verify it was called with the right model name
        calls = mock_init_azure.call_args_list
        azure_calls = [call for call in calls]
        litellm_params = azure_calls[0].kwargs["litellm_params"]
        print("litellm_params", litellm_params)
        assert (
            "azure_ad_token" in litellm_params
        ), "azure_ad_token not found in parameters"
        assert (
            litellm_params["azure_ad_token"] == "oidc/test-token"
        ), "azure_ad_token is not correct"
        # More detailed verification (optional)
        for call in azure_calls:
            assert "api_key" in call.kwargs, "api_key not found in parameters"
            assert "api_base" in call.kwargs, "api_base not found in parameters"
--- a/tests/litellm/llms/openai/responses/test_openai_responses_transformation.py
+++ b/tests/litellm/llms/openai/responses/test_openai_responses_transformation.py
@ -0,0 +1,239 @@
 import json
 import os
 import sys
 from unittest.mock import AsyncMock, MagicMock, patch
 import httpx
 import pytest
 sys.path.insert(
    0, os.path.abspath("../../../../..")
 )  # Adds the parent directory to the system path
 from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
 from litellm.types.llms.openai import (
    OutputTextDeltaEvent,
    ResponseCompletedEvent,
    ResponsesAPIRequestParams,
    ResponsesAPIResponse,
    ResponsesAPIStreamEvents,
 )
 class TestOpenAIResponsesAPIConfig:
    def setup_method(self):
        self.config = OpenAIResponsesAPIConfig()
        self.model = "gpt-4o"
        self.logging_obj = MagicMock()
    def test_map_openai_params(self):
        """Test that parameters are correctly mapped"""
        test_params = {"input": "Hello world", "temperature": 0.7, "stream": True}
        result = self.config.map_openai_params(
            response_api_optional_params=test_params,
            model=self.model,
            drop_params=False,
        )
        # The function should return the params unchanged
        assert result == test_params
    def validate_responses_api_request_params(self, params, expected_fields):
        """
        Validate that the params dict has the expected structure of ResponsesAPIRequestParams
        Args:
            params: The dict to validate
            expected_fields: Dict of field names and their expected values
        """
        # Check that it's a dict
        assert isinstance(params, dict), "Result should be a dict"
        # Check expected fields have correct values
        for field, value in expected_fields.items():
            assert field in params, f"Missing expected field: {field}"
            assert (
                params[field] == value
            ), f"Field {field} has value {params[field]}, expected {value}"
    def test_transform_responses_api_request(self):
        """Test request transformation"""
        input_text = "What is the capital of France?"
        optional_params = {"temperature": 0.7, "stream": True}
        result = self.config.transform_responses_api_request(
            model=self.model,
            input=input_text,
            response_api_optional_request_params=optional_params,
            litellm_params={},
            headers={},
        )
        # Validate the result has the expected structure and values
        expected_fields = {
            "model": self.model,
            "input": input_text,
            "temperature": 0.7,
            "stream": True,
        }
        self.validate_responses_api_request_params(result, expected_fields)
    def test_transform_streaming_response(self):
        """Test streaming response transformation"""
        # Test with a text delta event
        chunk = {
            "type": "response.output_text.delta",
            "item_id": "item_123",
            "output_index": 0,
            "content_index": 0,
            "delta": "Hello",
        }
        result = self.config.transform_streaming_response(
            model=self.model, parsed_chunk=chunk, logging_obj=self.logging_obj
        )
        assert isinstance(result, OutputTextDeltaEvent)
        assert result.type == ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA
        assert result.delta == "Hello"
        assert result.item_id == "item_123"
        # Test with a completed event - providing all required fields
        completed_chunk = {
            "type": "response.completed",
            "response": {
                "id": "resp_123",
                "created_at": 1234567890,
                "model": "gpt-4o",
                "object": "response",
                "output": [],
                "parallel_tool_calls": False,
                "error": None,
                "incomplete_details": None,
                "instructions": None,
                "metadata": None,
                "temperature": 0.7,
                "tool_choice": "auto",
                "tools": [],
                "top_p": 1.0,
                "max_output_tokens": None,
                "previous_response_id": None,
                "reasoning": None,
                "status": "completed",
                "text": None,
                "truncation": "auto",
                "usage": None,
                "user": None,
            },
        }
        # Mock the get_event_model_class to avoid validation issues in tests
        with patch.object(
            OpenAIResponsesAPIConfig, "get_event_model_class"
        ) as mock_get_class:
            mock_get_class.return_value = ResponseCompletedEvent
            result = self.config.transform_streaming_response(
                model=self.model,
                parsed_chunk=completed_chunk,
                logging_obj=self.logging_obj,
            )
            assert result.type == ResponsesAPIStreamEvents.RESPONSE_COMPLETED
            assert result.response.id == "resp_123"
    def test_validate_environment(self):
        """Test that validate_environment correctly sets the Authorization header"""
        # Test with provided API key
        headers = {}
        api_key = "test_api_key"
        result = self.config.validate_environment(
            headers=headers, model=self.model, api_key=api_key
        )
        assert "Authorization" in result
        assert result["Authorization"] == f"Bearer {api_key}"
        # Test with empty headers
        headers = {}
        with patch("litellm.api_key", "litellm_api_key"):
            result = self.config.validate_environment(headers=headers, model=self.model)
            assert "Authorization" in result
            assert result["Authorization"] == "Bearer litellm_api_key"
        # Test with existing headers
        headers = {"Content-Type": "application/json"}
        with patch("litellm.openai_key", "openai_key"):
            with patch("litellm.api_key", None):
                result = self.config.validate_environment(
                    headers=headers, model=self.model
                )
                assert "Authorization" in result
                assert result["Authorization"] == "Bearer openai_key"
                assert "Content-Type" in result
                assert result["Content-Type"] == "application/json"
        # Test with environment variable
        headers = {}
        with patch("litellm.api_key", None):
            with patch("litellm.openai_key", None):
                with patch(
                    "litellm.llms.openai.responses.transformation.get_secret_str",
                    return_value="env_api_key",
                ):
                    result = self.config.validate_environment(
                        headers=headers, model=self.model
                    )
                    assert "Authorization" in result
                    assert result["Authorization"] == "Bearer env_api_key"
    def test_get_complete_url(self):
        """Test that get_complete_url returns the correct URL"""
        # Test with provided API base
        api_base = "https://custom-openai.example.com/v1"
        result = self.config.get_complete_url(api_base=api_base, model=self.model)
        assert result == "https://custom-openai.example.com/v1/responses"
        # Test with litellm.api_base
        with patch("litellm.api_base", "https://litellm-api-base.example.com/v1"):
            result = self.config.get_complete_url(api_base=None, model=self.model)
            assert result == "https://litellm-api-base.example.com/v1/responses"
        # Test with environment variable
        with patch("litellm.api_base", None):
            with patch(
                "litellm.llms.openai.responses.transformation.get_secret_str",
                return_value="https://env-api-base.example.com/v1",
            ):
                result = self.config.get_complete_url(api_base=None, model=self.model)
                assert result == "https://env-api-base.example.com/v1/responses"
        # Test with default API base
        with patch("litellm.api_base", None):
            with patch(
                "litellm.llms.openai.responses.transformation.get_secret_str",
                return_value=None,
            ):
                result = self.config.get_complete_url(api_base=None, model=self.model)
                assert result == "https://api.openai.com/v1/responses"
        # Test with trailing slash in API base
        api_base = "https://custom-openai.example.com/v1/"
        result = self.config.get_complete_url(api_base=api_base, model=self.model)
        assert result == "https://custom-openai.example.com/v1/responses"
--- a/tests/litellm/responses/test_responses_utils.py
+++ b/tests/litellm/responses/test_responses_utils.py
@ -0,0 +1,150 @@
 import json
 import os
 import sys
 import pytest
 from fastapi.testclient import TestClient
 sys.path.insert(
    0, os.path.abspath("../../..")
 )  # Adds the parent directory to the system path
 import litellm
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.llms.openai.responses.transformation import OpenAIResponsesAPIConfig
 from litellm.responses.utils import ResponseAPILoggingUtils, ResponsesAPIRequestUtils
 from litellm.types.llms.openai import ResponsesAPIOptionalRequestParams
 from litellm.types.utils import Usage
 class TestResponsesAPIRequestUtils:
    def test_get_optional_params_responses_api(self):
        """Test that optional parameters are correctly processed for responses API"""
        # Setup
        model = "gpt-4o"
        config = OpenAIResponsesAPIConfig()
        optional_params = ResponsesAPIOptionalRequestParams(
            {"temperature": 0.7, "max_output_tokens": 100}
        )
        # Execute
        result = ResponsesAPIRequestUtils.get_optional_params_responses_api(
            model=model,
            responses_api_provider_config=config,
            response_api_optional_params=optional_params,
        )
        # Assert
        assert result == optional_params
        assert "temperature" in result
        assert result["temperature"] == 0.7
        assert "max_output_tokens" in result
        assert result["max_output_tokens"] == 100
    def test_get_optional_params_responses_api_unsupported_param(self):
        """Test that unsupported parameters raise an error"""
        # Setup
        model = "gpt-4o"
        config = OpenAIResponsesAPIConfig()
        optional_params = ResponsesAPIOptionalRequestParams(
            {"temperature": 0.7, "unsupported_param": "value"}
        )
        # Execute and Assert
        with pytest.raises(litellm.UnsupportedParamsError) as excinfo:
            ResponsesAPIRequestUtils.get_optional_params_responses_api(
                model=model,
                responses_api_provider_config=config,
                response_api_optional_params=optional_params,
            )
        assert "unsupported_param" in str(excinfo.value)
        assert model in str(excinfo.value)
    def test_get_requested_response_api_optional_param(self):
        """Test filtering parameters to only include those in ResponsesAPIOptionalRequestParams"""
        # Setup
        params = {
            "temperature": 0.7,
            "max_output_tokens": 100,
            "invalid_param": "value",
            "model": "gpt-4o",  # This is not in ResponsesAPIOptionalRequestParams
        }
        # Execute
        result = ResponsesAPIRequestUtils.get_requested_response_api_optional_param(
            params
        )
        # Assert
        assert "temperature" in result
        assert "max_output_tokens" in result
        assert "invalid_param" not in result
        assert "model" not in result
        assert result["temperature"] == 0.7
        assert result["max_output_tokens"] == 100
 class TestResponseAPILoggingUtils:
    def test_is_response_api_usage_true(self):
        """Test identification of Response API usage format"""
        # Setup
        usage = {"input_tokens": 10, "output_tokens": 20}
        # Execute
        result = ResponseAPILoggingUtils._is_response_api_usage(usage)
        # Assert
        assert result is True
    def test_is_response_api_usage_false(self):
        """Test identification of non-Response API usage format"""
        # Setup
        usage = {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}
        # Execute
        result = ResponseAPILoggingUtils._is_response_api_usage(usage)
        # Assert
        assert result is False
    def test_transform_response_api_usage_to_chat_usage(self):
        """Test transformation from Response API usage to Chat usage format"""
        # Setup
        usage = {
            "input_tokens": 10,
            "output_tokens": 20,
            "total_tokens": 30,
            "output_tokens_details": {"reasoning_tokens": 5},
        }
        # Execute
        result = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
            usage
        )
        # Assert
        assert isinstance(result, Usage)
        assert result.prompt_tokens == 10
        assert result.completion_tokens == 20
        assert result.total_tokens == 30
    def test_transform_response_api_usage_with_none_values(self):
        """Test transformation handles None values properly"""
        # Setup
        usage = {
            "input_tokens": 0,  # Changed from None to 0
            "output_tokens": 20,
            "total_tokens": 20,
            "output_tokens_details": {"reasoning_tokens": 5},
        }
        # Execute
        result = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
            usage
        )
        # Assert
        assert result.prompt_tokens == 0
        assert result.completion_tokens == 20
        assert result.total_tokens == 20
--- a/tests/llm_responses_api_testing/conftest.py
+++ b/tests/llm_responses_api_testing/conftest.py
@ -0,0 +1,63 @@
 # conftest.py
 import importlib
 import os
 import sys
 import pytest
 sys.path.insert(
    0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 import litellm
@pytest.fixture(scope="function", autouse=True)
 def setup_and_teardown():
    """
    This fixture reloads litellm before every function. To speed up testing by removing callbacks being chained.
    """
    curr_dir = os.getcwd()  # Get the current working directory
    sys.path.insert(
        0, os.path.abspath("../..")
    )  # Adds the project directory to the system path
    import litellm
    from litellm import Router
    importlib.reload(litellm)
    try:
        if hasattr(litellm, "proxy") and hasattr(litellm.proxy, "proxy_server"):
            import litellm.proxy.proxy_server
            importlib.reload(litellm.proxy.proxy_server)
    except Exception as e:
        print(f"Error reloading litellm.proxy.proxy_server: {e}")
    import asyncio
    loop = asyncio.get_event_loop_policy().new_event_loop()
    asyncio.set_event_loop(loop)
    print(litellm)
    # from litellm import Router, completion, aembedding, acompletion, embedding
    yield
    # Teardown code (executes after the yield point)
    loop.close()  # Close the loop created earlier
    asyncio.set_event_loop(None)  # Remove the reference to the loop
 def pytest_collection_modifyitems(config, items):
    # Separate tests in 'test_amazing_proxy_custom_logger.py' and other tests
    custom_logger_tests = [
        item for item in items if "custom_logger" in item.parent.name
    ]
    other_tests = [item for item in items if "custom_logger" not in item.parent.name]
    # Sort tests based on their names
    custom_logger_tests.sort(key=lambda x: x.name)
    other_tests.sort(key=lambda x: x.name)
    # Reorder the items list
    items[:] = custom_logger_tests + other_tests
--- a/tests/llm_responses_api_testing/test_openai_responses_api.py
+++ b/tests/llm_responses_api_testing/test_openai_responses_api.py
@ -0,0 +1,797 @@
 import os
 import sys
 import pytest
 import asyncio
 from typing import Optional
 from unittest.mock import patch, AsyncMock
 sys.path.insert(0, os.path.abspath("../.."))
 import litellm
 from litellm.integrations.custom_logger import CustomLogger
 import json
 from litellm.types.utils import StandardLoggingPayload
 from litellm.types.llms.openai import (
    ResponseCompletedEvent,
    ResponsesAPIResponse,
    ResponseTextConfig,
    ResponseAPIUsage,
    IncompleteDetails,
 )
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
 def validate_responses_api_response(response, final_chunk: bool = False):
    """
    Validate that a response from litellm.responses() or litellm.aresponses()
    conforms to the expected ResponsesAPIResponse structure.
    Args:
        response: The response object to validate
    Raises:
        AssertionError: If the response doesn't match the expected structure
    """
    # Validate response structure
    print("response=", json.dumps(response, indent=4, default=str))
    assert isinstance(
        response, ResponsesAPIResponse
    ), "Response should be an instance of ResponsesAPIResponse"
    # Required fields
    assert "id" in response and isinstance(
        response["id"], str
    ), "Response should have a string 'id' field"
    assert "created_at" in response and isinstance(
        response["created_at"], (int, float)
    ), "Response should have a numeric 'created_at' field"
    assert "output" in response and isinstance(
        response["output"], list
    ), "Response should have a list 'output' field"
    assert "parallel_tool_calls" in response and isinstance(
        response["parallel_tool_calls"], bool
    ), "Response should have a boolean 'parallel_tool_calls' field"
    # Optional fields with their expected types
    optional_fields = {
        "error": (dict, type(None)),  # error can be dict or None
        "incomplete_details": (IncompleteDetails, type(None)),
        "instructions": (str, type(None)),
        "metadata": dict,
        "model": str,
        "object": str,
        "temperature": (int, float),
        "tool_choice": (dict, str),
        "tools": list,
        "top_p": (int, float),
        "max_output_tokens": (int, type(None)),
        "previous_response_id": (str, type(None)),
        "reasoning": dict,
        "status": str,
        "text": ResponseTextConfig,
        "truncation": str,
        "usage": ResponseAPIUsage,
        "user": (str, type(None)),
    }
    if final_chunk is False:
        optional_fields["usage"] = type(None)
    for field, expected_type in optional_fields.items():
        if field in response:
            assert isinstance(
                response[field], expected_type
            ), f"Field '{field}' should be of type {expected_type}, but got {type(response[field])}"
    # Check if output has at least one item
    if final_chunk is True:
        assert (
            len(response["output"]) > 0
        ), "Response 'output' field should have at least one item"
    return True  # Return True if validation passes
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
 async def test_basic_openai_responses_api(sync_mode):
    litellm._turn_on_debug()
    if sync_mode:
        response = litellm.responses(
            model="gpt-4o", input="Basic ping", max_output_tokens=20
        )
    else:
        response = await litellm.aresponses(
            model="gpt-4o", input="Basic ping", max_output_tokens=20
        )
    print("litellm response=", json.dumps(response, indent=4, default=str))
    # Use the helper function to validate the response
    validate_responses_api_response(response, final_chunk=True)
@pytest.mark.parametrize("sync_mode", [True])
@pytest.mark.asyncio
 async def test_basic_openai_responses_api_streaming(sync_mode):
    litellm._turn_on_debug()
    if sync_mode:
        response = litellm.responses(
            model="gpt-4o",
            input="Basic ping",
            stream=True,
        )
        for event in response:
            print("litellm response=", json.dumps(event, indent=4, default=str))
    else:
        response = await litellm.aresponses(
            model="gpt-4o",
            input="Basic ping",
            stream=True,
        )
        async for event in response:
            print("litellm response=", json.dumps(event, indent=4, default=str))
 class TestCustomLogger(CustomLogger):
    def __init__(
        self,
    ):
        self.standard_logging_object: Optional[StandardLoggingPayload] = None
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("in async_log_success_event")
        print("kwargs=", json.dumps(kwargs, indent=4, default=str))
        self.standard_logging_object = kwargs["standard_logging_object"]
        pass
 def validate_standard_logging_payload(
    slp: StandardLoggingPayload, response: ResponsesAPIResponse, request_model: str
 ):
    """
    Validate that a StandardLoggingPayload object matches the expected response
    Args:
        slp (StandardLoggingPayload): The standard logging payload object to validate
        response (dict): The litellm response to compare against
        request_model (str): The model name that was requested
    """
    # Validate payload exists
    assert slp is not None, "Standard logging payload should not be None"
    # Validate token counts
    print("response=", json.dumps(response, indent=4, default=str))
    assert (
        slp["prompt_tokens"] == response["usage"]["input_tokens"]
    ), "Prompt tokens mismatch"
    assert (
        slp["completion_tokens"] == response["usage"]["output_tokens"]
    ), "Completion tokens mismatch"
    assert (
        slp["total_tokens"]
        == response["usage"]["input_tokens"] + response["usage"]["output_tokens"]
    ), "Total tokens mismatch"
    # Validate spend and response metadata
    assert slp["response_cost"] > 0, "Response cost should be greater than 0"
    assert slp["id"] == response["id"], "Response ID mismatch"
    assert slp["model"] == request_model, "Model name mismatch"
    # Validate messages
    assert slp["messages"] == [{"content": "hi", "role": "user"}], "Messages mismatch"
    # Validate complete response structure
    validate_responses_match(slp["response"], response)
@pytest.mark.asyncio
 async def test_basic_openai_responses_api_streaming_with_logging():
    litellm._turn_on_debug()
    litellm.set_verbose = True
    test_custom_logger = TestCustomLogger()
    litellm.callbacks = [test_custom_logger]
    request_model = "gpt-4o"
    response = await litellm.aresponses(
        model=request_model,
        input="hi",
        stream=True,
    )
    final_response: Optional[ResponseCompletedEvent] = None
    async for event in response:
        if event.type == "response.completed":
            final_response = event
        print("litellm response=", json.dumps(event, indent=4, default=str))
    print("sleeping for 2 seconds...")
    await asyncio.sleep(2)
    print(
        "standard logging payload=",
        json.dumps(test_custom_logger.standard_logging_object, indent=4, default=str),
    )
    assert final_response is not None
    assert test_custom_logger.standard_logging_object is not None
    validate_standard_logging_payload(
        slp=test_custom_logger.standard_logging_object,
        response=final_response.response,
        request_model=request_model,
    )
 def validate_responses_match(slp_response, litellm_response):
    """Validate that the standard logging payload OpenAI response matches the litellm response"""
    # Validate core fields
    assert slp_response["id"] == litellm_response["id"], "ID mismatch"
    assert slp_response["model"] == litellm_response["model"], "Model mismatch"
    assert (
        slp_response["created_at"] == litellm_response["created_at"]
    ), "Created at mismatch"
    # Validate usage
    assert (
        slp_response["usage"]["input_tokens"]
        == litellm_response["usage"]["input_tokens"]
    ), "Input tokens mismatch"
    assert (
        slp_response["usage"]["output_tokens"]
        == litellm_response["usage"]["output_tokens"]
    ), "Output tokens mismatch"
    assert (
        slp_response["usage"]["total_tokens"]
        == litellm_response["usage"]["total_tokens"]
    ), "Total tokens mismatch"
    # Validate output/messages
    assert len(slp_response["output"]) == len(
        litellm_response["output"]
    ), "Output length mismatch"
    for slp_msg, litellm_msg in zip(slp_response["output"], litellm_response["output"]):
        assert slp_msg["role"] == litellm_msg.role, "Message role mismatch"
        # Access the content's text field for the litellm response
        litellm_content = litellm_msg.content[0].text if litellm_msg.content else ""
        assert (
            slp_msg["content"][0]["text"] == litellm_content
        ), f"Message content mismatch. Expected {litellm_content}, Got {slp_msg['content']}"
        assert slp_msg["status"] == litellm_msg.status, "Message status mismatch"
@pytest.mark.asyncio
 async def test_basic_openai_responses_api_non_streaming_with_logging():
    litellm._turn_on_debug()
    litellm.set_verbose = True
    test_custom_logger = TestCustomLogger()
    litellm.callbacks = [test_custom_logger]
    request_model = "gpt-4o"
    response = await litellm.aresponses(
        model=request_model,
        input="hi",
    )
    print("litellm response=", json.dumps(response, indent=4, default=str))
    print("response hidden params=", response._hidden_params)
    print("sleeping for 2 seconds...")
    await asyncio.sleep(2)
    print(
        "standard logging payload=",
        json.dumps(test_custom_logger.standard_logging_object, indent=4, default=str),
    )
    assert response is not None
    assert test_custom_logger.standard_logging_object is not None
    validate_standard_logging_payload(
        test_custom_logger.standard_logging_object, response, request_model
    )
 def validate_stream_event(event):
    """
    Validate that a streaming event from litellm.responses() or litellm.aresponses()
    with stream=True conforms to the expected structure based on its event type.
    Args:
        event: The streaming event object to validate
    Raises:
        AssertionError: If the event doesn't match the expected structure for its type
    """
    # Common validation for all event types
    assert hasattr(event, "type"), "Event should have a 'type' attribute"
    # Type-specific validation
    if event.type == "response.created" or event.type == "response.in_progress":
        assert hasattr(
            event, "response"
        ), f"{event.type} event should have a 'response' attribute"
        validate_responses_api_response(event.response, final_chunk=False)
    elif event.type == "response.completed":
        assert hasattr(
            event, "response"
        ), "response.completed event should have a 'response' attribute"
        validate_responses_api_response(event.response, final_chunk=True)
        # Usage is guaranteed only on the completed event
        assert (
            "usage" in event.response
        ), "response.completed event should have usage information"
        print("Usage in event.response=", event.response["usage"])
        assert isinstance(event.response["usage"], ResponseAPIUsage)
    elif event.type == "response.failed" or event.type == "response.incomplete":
        assert hasattr(
            event, "response"
        ), f"{event.type} event should have a 'response' attribute"
    elif (
        event.type == "response.output_item.added"
        or event.type == "response.output_item.done"
    ):
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "item"
        ), f"{event.type} event should have an 'item' attribute"
    elif (
        event.type == "response.content_part.added"
        or event.type == "response.content_part.done"
    ):
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "content_index"
        ), f"{event.type} event should have a 'content_index' attribute"
        assert hasattr(
            event, "part"
        ), f"{event.type} event should have a 'part' attribute"
    elif event.type == "response.output_text.delta":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "content_index"
        ), f"{event.type} event should have a 'content_index' attribute"
        assert hasattr(
            event, "delta"
        ), f"{event.type} event should have a 'delta' attribute"
    elif event.type == "response.output_text.annotation.added":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "content_index"
        ), f"{event.type} event should have a 'content_index' attribute"
        assert hasattr(
            event, "annotation_index"
        ), f"{event.type} event should have an 'annotation_index' attribute"
        assert hasattr(
            event, "annotation"
        ), f"{event.type} event should have an 'annotation' attribute"
    elif event.type == "response.output_text.done":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "content_index"
        ), f"{event.type} event should have a 'content_index' attribute"
        assert hasattr(
            event, "text"
        ), f"{event.type} event should have a 'text' attribute"
    elif event.type == "response.refusal.delta":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "content_index"
        ), f"{event.type} event should have a 'content_index' attribute"
        assert hasattr(
            event, "delta"
        ), f"{event.type} event should have a 'delta' attribute"
    elif event.type == "response.refusal.done":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "content_index"
        ), f"{event.type} event should have a 'content_index' attribute"
        assert hasattr(
            event, "refusal"
        ), f"{event.type} event should have a 'refusal' attribute"
    elif event.type == "response.function_call_arguments.delta":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "delta"
        ), f"{event.type} event should have a 'delta' attribute"
    elif event.type == "response.function_call_arguments.done":
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "arguments"
        ), f"{event.type} event should have an 'arguments' attribute"
    elif event.type in [
        "response.file_search_call.in_progress",
        "response.file_search_call.searching",
        "response.file_search_call.completed",
        "response.web_search_call.in_progress",
        "response.web_search_call.searching",
        "response.web_search_call.completed",
    ]:
        assert hasattr(
            event, "output_index"
        ), f"{event.type} event should have an 'output_index' attribute"
        assert hasattr(
            event, "item_id"
        ), f"{event.type} event should have an 'item_id' attribute"
    elif event.type == "error":
        assert hasattr(
            event, "message"
        ), "Error event should have a 'message' attribute"
    return True  # Return True if validation passes
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
 async def test_openai_responses_api_streaming_validation(sync_mode):
    """Test that validates each streaming event from the responses API"""
    litellm._turn_on_debug()
    event_types_seen = set()
    if sync_mode:
        response = litellm.responses(
            model="gpt-4o",
            input="Tell me about artificial intelligence in 3 sentences.",
            stream=True,
        )
        for event in response:
            print(f"Validating event type: {event.type}")
            validate_stream_event(event)
            event_types_seen.add(event.type)
    else:
        response = await litellm.aresponses(
            model="gpt-4o",
            input="Tell me about artificial intelligence in 3 sentences.",
            stream=True,
        )
        async for event in response:
            print(f"Validating event type: {event.type}")
            validate_stream_event(event)
            event_types_seen.add(event.type)
    # At minimum, we should see these core event types
    required_events = {"response.created", "response.completed"}
    missing_events = required_events - event_types_seen
    assert not missing_events, f"Missing required event types: {missing_events}"
    print(f"Successfully validated all event types: {event_types_seen}")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
 async def test_openai_responses_litellm_router(sync_mode):
    """
    Test the OpenAI responses API with LiteLLM Router in both sync and async modes
    """
    litellm._turn_on_debug()
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt4o-special-alias",
                "litellm_params": {
                    "model": "gpt-4o",
                    "api_key": os.getenv("OPENAI_API_KEY"),
                },
            }
        ]
    )
    # Call the handler
    if sync_mode:
        response = router.responses(
            model="gpt4o-special-alias",
            input="Hello, can you tell me a short joke?",
            max_output_tokens=100,
        )
        print("SYNC MODE RESPONSE=", response)
    else:
        response = await router.aresponses(
            model="gpt4o-special-alias",
            input="Hello, can you tell me a short joke?",
            max_output_tokens=100,
        )
    print(
        f"Router {'sync' if sync_mode else 'async'} response=",
        json.dumps(response, indent=4, default=str),
    )
    # Use the helper function to validate the response
    validate_responses_api_response(response, final_chunk=True)
    return response
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
 async def test_openai_responses_litellm_router_streaming(sync_mode):
    """
    Test the OpenAI responses API with streaming through LiteLLM Router
    """
    litellm._turn_on_debug()
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt4o-special-alias",
                "litellm_params": {
                    "model": "gpt-4o",
                    "api_key": os.getenv("OPENAI_API_KEY"),
                },
            }
        ]
    )
    event_types_seen = set()
    if sync_mode:
        response = router.responses(
            model="gpt4o-special-alias",
            input="Tell me about artificial intelligence in 2 sentences.",
            stream=True,
        )
        for event in response:
            print(f"Validating event type: {event.type}")
            validate_stream_event(event)
            event_types_seen.add(event.type)
    else:
        response = await router.aresponses(
            model="gpt4o-special-alias",
            input="Tell me about artificial intelligence in 2 sentences.",
            stream=True,
        )
        async for event in response:
            print(f"Validating event type: {event.type}")
            validate_stream_event(event)
            event_types_seen.add(event.type)
    # At minimum, we should see these core event types
    required_events = {"response.created", "response.completed"}
    missing_events = required_events - event_types_seen
    assert not missing_events, f"Missing required event types: {missing_events}"
    print(f"Successfully validated all event types: {event_types_seen}")
@pytest.mark.asyncio
 async def test_openai_responses_litellm_router_no_metadata():
    """
    Test that metadata is not passed through when using the Router for responses API
    """
    mock_response = {
        "id": "resp_123",
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "gpt-4o",
        "output": [
            {
                "type": "message",
                "id": "msg_123",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {"type": "output_text", "text": "Hello world!", "annotations": []}
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 10,
            "output_tokens": 20,
            "total_tokens": 30,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        # Adding all required fields
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": {},
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }
    class MockResponse:
        def __init__(self, json_data, status_code):
            self._json_data = json_data
            self.status_code = status_code
            self.text = str(json_data)
        def json(self):  # Changed from async to sync
            return self._json_data
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock_post:
        # Configure the mock to return our response
        mock_post.return_value = MockResponse(mock_response, 200)
        litellm._turn_on_debug()
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt4o-special-alias",
                    "litellm_params": {
                        "model": "gpt-4o",
                        "api_key": "fake-key",
                    },
                }
            ]
        )
        # Call the handler with metadata
        await router.aresponses(
            model="gpt4o-special-alias",
            input="Hello, can you tell me a short joke?",
        )
        # Check the request body
        request_body = mock_post.call_args.kwargs["data"]
        print("Request body:", json.dumps(request_body, indent=4))
        loaded_request_body = json.loads(request_body)
        print("Loaded request body:", json.dumps(loaded_request_body, indent=4))
        # Assert metadata is not in the request
        assert (
            loaded_request_body["metadata"] == None
        ), "metadata should not be in the request body"
        mock_post.assert_called_once()
@pytest.mark.asyncio
 async def test_openai_responses_litellm_router_with_metadata():
    """
    Test that metadata is correctly passed through when explicitly provided to the Router for responses API
    """
    test_metadata = {
        "user_id": "123",
        "conversation_id": "abc",
        "custom_field": "test_value",
    }
    mock_response = {
        "id": "resp_123",
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "gpt-4o",
        "output": [
            {
                "type": "message",
                "id": "msg_123",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {"type": "output_text", "text": "Hello world!", "annotations": []}
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 10,
            "output_tokens": 20,
            "total_tokens": 30,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": test_metadata,  # Include the test metadata in response
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }
    class MockResponse:
        def __init__(self, json_data, status_code):
            self._json_data = json_data
            self.status_code = status_code
            self.text = str(json_data)
        def json(self):
            return self._json_data
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock_post:
        # Configure the mock to return our response
        mock_post.return_value = MockResponse(mock_response, 200)
        litellm._turn_on_debug()
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt4o-special-alias",
                    "litellm_params": {
                        "model": "gpt-4o",
                        "api_key": "fake-key",
                    },
                }
            ]
        )
        # Call the handler with metadata
        await router.aresponses(
            model="gpt4o-special-alias",
            input="Hello, can you tell me a short joke?",
            metadata=test_metadata,
        )
        # Check the request body
        request_body = mock_post.call_args.kwargs["data"]
        loaded_request_body = json.loads(request_body)
        print("Request body:", json.dumps(loaded_request_body, indent=4))
        # Assert metadata matches exactly what was passed
        assert (
            loaded_request_body["metadata"] == test_metadata
        ), "metadata in request body should match what was passed"
        mock_post.assert_called_once()
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@ -868,10 +868,13 @@ class BaseLLMChatTest(ABC):
        except Exception as e:
            pytest.fail(f"Error occurred: {e}")
    @pytest.mark.flaky(retries=3, delay=1)
    @pytest.mark.asyncio
    async def test_completion_cost(self):
        from litellm import completion_cost
        litellm._turn_on_debug()
        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
        litellm.model_cost = litellm.get_model_cost_map(url="")
--- a/tests/llm_translation/test_azure_openai.py
+++ b/tests/llm_translation/test_azure_openai.py
@ -522,7 +522,7 @@ async def test_async_azure_max_retries_0(
@pytest.mark.parametrize("max_retries", [0, 4])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("sync_mode", [True, False])
-@patch("litellm.llms.azure.completion.handler.select_azure_base_url_or_endpoint")
+@patch("litellm.llms.azure.common_utils.select_azure_base_url_or_endpoint")
@pytest.mark.asyncio
 async def test_azure_instruct(
    mock_select_azure_base_url_or_endpoint, max_retries, stream, sync_mode
@ -556,12 +556,11 @@ async def test_azure_instruct(
@pytest.mark.parametrize("max_retries", [0, 4])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("sync_mode", [True, False])
-@patch("litellm.llms.azure.azure.select_azure_base_url_or_endpoint")
+@patch("litellm.llms.azure.common_utils.select_azure_base_url_or_endpoint")
@pytest.mark.asyncio
 async def test_azure_embedding_max_retries_0(
-    mock_select_azure_base_url_or_endpoint, max_retries, stream, sync_mode
+    mock_select_azure_base_url_or_endpoint, max_retries, sync_mode
 ):
    from litellm import aembedding, embedding
@ -569,7 +568,6 @@ async def test_azure_embedding_max_retries_0(
        "model": "azure/azure-embedding-model",
        "input": "Hello world",
        "max_retries": max_retries,
        "stream": stream,
    }
    try:
@ -581,6 +579,10 @@ async def test_azure_embedding_max_retries_0(
        print(e)
    mock_select_azure_base_url_or_endpoint.assert_called_once()
    print(
        "mock_select_azure_base_url_or_endpoint.call_args.kwargs",
        mock_select_azure_base_url_or_endpoint.call_args.kwargs,
    )
    assert (
        mock_select_azure_base_url_or_endpoint.call_args.kwargs["azure_client_params"][
            "max_retries"
--- a/tests/local_testing/test_completion.py
+++ b/tests/local_testing/test_completion.py
@ -2933,13 +2933,19 @@ def test_completion_azure():
 # test_completion_azure()
@pytest.mark.skip(
    reason="this is bad test. It doesn't actually fail if the token is not set in the header. "
 )
 def test_azure_openai_ad_token():
    import time
    # this tests if the azure ad token is set in the request header
    # the request can fail since azure ad tokens expire after 30 mins, but the header MUST have the azure ad token
    # we use litellm.input_callbacks for this test
    def tester(
        kwargs,  # kwargs to completion
    ):
        print("inside kwargs")
        print(kwargs["additional_args"])
        if kwargs["additional_args"]["headers"]["Authorization"] != "Bearer gm":
            pytest.fail("AZURE AD TOKEN Passed but not set in request header")
@ -2962,7 +2968,9 @@ def test_azure_openai_ad_token():
        litellm.input_callback = []
    except Exception as e:
        litellm.input_callback = []
-        pytest.fail(f"An exception occurs - {str(e)}")
+        pass
    time.sleep(1)
 # test_azure_openai_ad_token()
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@ -2769,6 +2769,7 @@ def test_add_known_models():
    )
@pytest.mark.skip(reason="flaky test")
 def test_bedrock_cost_calc_with_region():
    from litellm import completion
--- a/tests/local_testing/test_pass_through_endpoints.py
+++ b/tests/local_testing/test_pass_through_endpoints.py
@ -329,3 +329,71 @@ async def test_aaapass_through_endpoint_pass_through_keys_langfuse(
        setattr(
            litellm.proxy.proxy_server, "proxy_logging_obj", original_proxy_logging_obj
        )
@pytest.mark.asyncio
 async def test_pass_through_endpoint_bing(client, monkeypatch):
    import litellm
    captured_requests = []
    async def mock_bing_request(*args, **kwargs):
        captured_requests.append((args, kwargs))
        mock_response = httpx.Response(
            200,
            json={
                "_type": "SearchResponse",
                "queryContext": {"originalQuery": "bob barker"},
                "webPages": {
                    "webSearchUrl": "https://www.bing.com/search?q=bob+barker",
                    "totalEstimatedMatches": 12000000,
                    "value": [],
                },
            },
        )
        mock_response.request = Mock(spec=httpx.Request)
        return mock_response
    monkeypatch.setattr("httpx.AsyncClient.request", mock_bing_request)
    # Define a pass-through endpoint
    pass_through_endpoints = [
        {
            "path": "/bing/search",
            "target": "https://api.bing.microsoft.com/v7.0/search?setLang=en-US&mkt=en-US",
            "headers": {"Ocp-Apim-Subscription-Key": "XX"},
            "forward_headers": True,
            # Additional settings
            "merge_query_params": True,
            "auth": True,
        },
        {
            "path": "/bing/search-no-merge-params",
            "target": "https://api.bing.microsoft.com/v7.0/search?setLang=en-US&mkt=en-US",
            "headers": {"Ocp-Apim-Subscription-Key": "XX"},
            "forward_headers": True,
        },
    ]
    # Initialize the pass-through endpoint
    await initialize_pass_through_endpoints(pass_through_endpoints)
    general_settings: Optional[dict] = (
        getattr(litellm.proxy.proxy_server, "general_settings", {}) or {}
    )
    general_settings.update({"pass_through_endpoints": pass_through_endpoints})
    setattr(litellm.proxy.proxy_server, "general_settings", general_settings)
    # Make 2 requests thru the pass-through endpoint
    client.get("/bing/search?q=bob+barker")
    client.get("/bing/search-no-merge-params?q=bob+barker")
    first_transformed_url = captured_requests[0][1]["url"]
    second_transformed_url = captured_requests[1][1]["url"]
    # Assert the response
    assert (
        first_transformed_url
        == "https://api.bing.microsoft.com/v7.0/search?q=bob+barker&setLang=en-US&mkt=en-US"
        and second_transformed_url
        == "https://api.bing.microsoft.com/v7.0/search?setLang=en-US&mkt=en-US"
    )
--- a/tests/local_testing/test_router.py
+++ b/tests/local_testing/test_router.py
@ -194,6 +194,9 @@ def test_router_specific_model_via_id():
    router.completion(model="1234", messages=[{"role": "user", "content": "Hey!"}])
@pytest.mark.skip(
    reason="Router no longer creates clients, this is delegated to the provider integration."
 )
 def test_router_azure_ai_client_init():
    _deployment = {
@ -219,6 +222,9 @@ def test_router_azure_ai_client_init():
    assert not isinstance(_client, AsyncAzureOpenAI)
@pytest.mark.skip(
    reason="Router no longer creates clients, this is delegated to the provider integration."
 )
 def test_router_azure_ad_token_provider():
    _deployment = {
        "model_name": "gpt-4o_2024-05-13",
@ -247,8 +253,10 @@ def test_router_azure_ad_token_provider():
        assert isinstance(_client, AsyncAzureOpenAI)
        assert _client._azure_ad_token_provider is not None
        assert isinstance(_client._azure_ad_token_provider.__closure__, tuple)
-        assert isinstance(_client._azure_ad_token_provider.__closure__[0].cell_contents._credential,
+        assert isinstance(
-                        getattr(identity, os.environ["AZURE_CREDENTIAL"]))
+            _client._azure_ad_token_provider.__closure__[0].cell_contents._credential,
            getattr(identity, os.environ["AZURE_CREDENTIAL"]),
        )
 def test_router_sensitive_keys():
@ -312,91 +320,6 @@ def test_router_order():
        assert response._hidden_params["model_id"] == "1"
@pytest.mark.parametrize("num_retries", [None, 2])
@pytest.mark.parametrize("max_retries", [None, 4])
 def test_router_num_retries_init(num_retries, max_retries):
    """
    - test when num_retries set v/s not
    - test client value when max retries set v/s not
    """
    router = Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",  # openai model name
                "litellm_params": {  # params for litellm completion/embedding call
                    "model": "azure/chatgpt-v-2",
                    "api_key": "bad-key",
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "api_base": os.getenv("AZURE_API_BASE"),
                    "max_retries": max_retries,
                },
                "model_info": {"id": 12345},
            },
        ],
        num_retries=num_retries,
    )
    if num_retries is not None:
        assert router.num_retries == num_retries
    else:
        assert router.num_retries == openai.DEFAULT_MAX_RETRIES
    model_client = router._get_client(
        {"model_info": {"id": 12345}}, client_type="async", kwargs={}
    )
    if max_retries is not None:
        assert getattr(model_client, "max_retries") == max_retries
    else:
        assert getattr(model_client, "max_retries") == 0
@pytest.mark.parametrize(
    "timeout", [10, 1.0, httpx.Timeout(timeout=300.0, connect=20.0)]
 )
@pytest.mark.parametrize("ssl_verify", [True, False])
 def test_router_timeout_init(timeout, ssl_verify):
    """
    Allow user to pass httpx.Timeout
    related issue - https://github.com/BerriAI/litellm/issues/3162
    """
    litellm.ssl_verify = ssl_verify
    router = Router(
        model_list=[
            {
                "model_name": "test-model",
                "litellm_params": {
                    "model": "azure/chatgpt-v-2",
                    "api_key": os.getenv("AZURE_API_KEY"),
                    "api_base": os.getenv("AZURE_API_BASE"),
                    "api_version": os.getenv("AZURE_API_VERSION"),
                    "timeout": timeout,
                },
                "model_info": {"id": 1234},
            }
        ]
    )
    model_client = router._get_client(
        deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
    )
    assert getattr(model_client, "timeout") == timeout
    print(f"vars model_client: {vars(model_client)}")
    http_client = getattr(model_client, "_client")
    print(f"http client: {vars(http_client)}, ssl_Verify={ssl_verify}")
    if ssl_verify == False:
        assert http_client._transport._pool._ssl_context.verify_mode.name == "CERT_NONE"
    else:
        assert (
            http_client._transport._pool._ssl_context.verify_mode.name
            == "CERT_REQUIRED"
        )
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
 async def test_router_retries(sync_mode):
@ -445,6 +368,9 @@ async def test_router_retries(sync_mode):
        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com",
    ],
 )
@pytest.mark.skip(
    reason="Router no longer creates clients, this is delegated to the provider integration."
 )
 def test_router_azure_ai_studio_init(mistral_api_base):
    router = Router(
        model_list=[
@ -460,16 +386,21 @@ def test_router_azure_ai_studio_init(mistral_api_base):
        ]
    )
-    model_client = router._get_client(
+    # model_client = router._get_client(
-        deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
+    #     deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
    # )
    # url = getattr(model_client, "_base_url")
    # uri_reference = str(getattr(url, "_uri_reference"))
    # print(f"uri_reference: {uri_reference}")
    # assert "/v1/" in uri_reference
    # assert uri_reference.count("v1") == 1
    response = router.completion(
        model="azure/mistral-large-latest",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
-    url = getattr(model_client, "_base_url")
+    assert response is not None
    uri_reference = str(getattr(url, "_uri_reference"))
    print(f"uri_reference: {uri_reference}")
    assert "/v1/" in uri_reference
    assert uri_reference.count("v1") == 1
 def test_exception_raising():
--- a/tests/local_testing/test_router_client_init.py
+++ b/tests/local_testing/test_router_client_init.py
@ -137,6 +137,7 @@ def test_router_init_azure_service_principal_with_secret_with_environment_variab
    mocked_os_lib: MagicMock,
    mocked_credential: MagicMock,
    mocked_get_bearer_token_provider: MagicMock,
    monkeypatch,
 ) -> None:
    """
    Test router initialization and sample completion using Azure Service Principal with Secret authentication workflow,
@ -145,6 +146,7 @@ def test_router_init_azure_service_principal_with_secret_with_environment_variab
    To allow for local testing without real credentials, first must mock Azure SDK authentication functions
    and environment variables.
    """
    monkeypatch.delenv("AZURE_API_KEY", raising=False)
    litellm.enable_azure_ad_token_refresh = True
    # mock the token provider function
    mocked_func_generating_token = MagicMock(return_value="test_token")
@ -182,25 +184,25 @@ def test_router_init_azure_service_principal_with_secret_with_environment_variab
    # initialize the router
    router = Router(model_list=model_list)
-    # first check if environment variables were used at all
+    # # first check if environment variables were used at all
-    mocked_environ.assert_called()
+    # mocked_environ.assert_called()
-    # then check if the client was initialized with the correct environment variables
+    # # then check if the client was initialized with the correct environment variables
-    mocked_credential.assert_called_with(
+    # mocked_credential.assert_called_with(
-        **{
+    #     **{
-            "client_id": environment_variables_expected_to_use["AZURE_CLIENT_ID"],
+    #         "client_id": environment_variables_expected_to_use["AZURE_CLIENT_ID"],
-            "client_secret": environment_variables_expected_to_use[
+    #         "client_secret": environment_variables_expected_to_use[
-                "AZURE_CLIENT_SECRET"
+    #             "AZURE_CLIENT_SECRET"
-            ],
+    #         ],
-            "tenant_id": environment_variables_expected_to_use["AZURE_TENANT_ID"],
+    #         "tenant_id": environment_variables_expected_to_use["AZURE_TENANT_ID"],
-        }
+    #     }
-    )
+    # )
-    # check if the token provider was called at all
+    # # check if the token provider was called at all
-    mocked_get_bearer_token_provider.assert_called()
+    # mocked_get_bearer_token_provider.assert_called()
-    # then check if the token provider was initialized with the mocked credential
+    # # then check if the token provider was initialized with the mocked credential
-    for call_args in mocked_get_bearer_token_provider.call_args_list:
+    # for call_args in mocked_get_bearer_token_provider.call_args_list:
-        assert call_args.args[0] == mocked_credential.return_value
+    #     assert call_args.args[0] == mocked_credential.return_value
-    # however, at this point token should not be fetched yet
+    # # however, at this point token should not be fetched yet
-    mocked_func_generating_token.assert_not_called()
+    # mocked_func_generating_token.assert_not_called()
    # now let's try to make a completion call
    deployment = model_list[0]
--- a/tests/local_testing/test_router_init.py
+++ b/tests/local_testing/test_router_init.py
--- a/tests/local_testing/test_router_utils.py
+++ b/tests/local_testing/test_router_utils.py
@ -418,3 +418,21 @@ def test_router_handle_clientside_credential():
    assert new_deployment.litellm_params.api_key == "123"
    assert len(router.get_model_list()) == 2
 def test_router_get_async_openai_model_client():
    router = Router(
        model_list=[
            {
                "model_name": "gemini/*",
                "litellm_params": {
                    "model": "gemini/*",
                    "api_base": "https://api.gemini.com",
                },
            }
        ]
    )
    model_client = router._get_async_openai_model_client(
        deployment=MagicMock(), kwargs={}
    )
    assert model_client is None
--- a/tests/router_unit_tests/test_router_endpoints.py
+++ b/tests/router_unit_tests/test_router_endpoints.py
@ -315,12 +315,18 @@ async def test_router_with_empty_choices(model_list):
    assert response is not None
-@pytest.mark.asyncio
+@pytest.mark.parametrize("sync_mode", [True, False])
-async def test_ageneric_api_call_with_fallbacks_basic():
+def test_generic_api_call_with_fallbacks_basic(sync_mode):
    """
-    Test the _ageneric_api_call_with_fallbacks method with a basic successful call
+    Test both the sync and async versions of generic_api_call_with_fallbacks with a basic successful call
    """
-    # Create a mock function that will be passed to _ageneric_api_call_with_fallbacks
+    # Create a mock function that will be passed to generic_api_call_with_fallbacks
    if sync_mode:
        from unittest.mock import Mock
        mock_function = Mock()
        mock_function.__name__ = "test_function"
    else:
        mock_function = AsyncMock()
        mock_function.__name__ = "test_function"
@ -347,13 +353,23 @@ async def test_ageneric_api_call_with_fallbacks_basic():
        ]
    )
-    # Call the _ageneric_api_call_with_fallbacks method
+    # Call the appropriate generic_api_call_with_fallbacks method
-    response = await router._ageneric_api_call_with_fallbacks(
+    if sync_mode:
        response = router._generic_api_call_with_fallbacks(
            model="test-model-alias",
            original_function=mock_function,
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=100,
        )
    else:
        response = asyncio.run(
            router._ageneric_api_call_with_fallbacks(
                model="test-model-alias",
                original_function=mock_function,
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=100,
            )
        )
    # Verify the mock function was called
    mock_function.assert_called_once()
@ -510,3 +526,36 @@ async def test__aadapter_completion():
        # Verify async_routing_strategy_pre_call_checks was called
        router.async_routing_strategy_pre_call_checks.assert_called_once()
 def test_initialize_router_endpoints():
    """
    Test that initialize_router_endpoints correctly sets up all router endpoints
    """
    # Create a router with a basic model
    router = Router(
        model_list=[
            {
                "model_name": "test-model",
                "litellm_params": {
                    "model": "anthropic/test-model",
                    "api_key": "fake-api-key",
                },
            }
        ]
    )
    # Explicitly call initialize_router_endpoints
    router.initialize_router_endpoints()
    # Verify all expected endpoints are initialized
    assert hasattr(router, "amoderation")
    assert hasattr(router, "aanthropic_messages")
    assert hasattr(router, "aresponses")
    assert hasattr(router, "responses")
    # Verify the endpoints are callable
    assert callable(router.amoderation)
    assert callable(router.aanthropic_messages)
    assert callable(router.aresponses)
    assert callable(router.responses)
--- a/Show more
+++ b/Show more