Merge branch 'BerriAI:main' into NolanTrem/GeminiSystemMessages

Nolan Tremelling 2025-04-15 13:38:12 -07:00 committed by GitHub
commit adc4f161da
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
74 changed files with 2462 additions and 538 deletions

.gitignore vendored

@ -73,6 +73,7 @@ tests/local_testing/log.txt
.codegpt
litellm/proxy/_new_new_secret_config.yaml
litellm/proxy/custom_guardrail.py
.mypy_cache/*
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/404.html
litellm/proxy/_experimental/out/model_hub.html


@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f
| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Cost Tracking | ✅ | works across all integrations |
| Logging | ✅ | works across all integrations |
| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) |
| Streaming | ✅ | |


@ -323,6 +323,9 @@ router_settings:
| AZURE_AUTHORITY_HOST | Azure authority host URL
| AZURE_CLIENT_ID | Client ID for Azure services
| AZURE_CLIENT_SECRET | Client secret for Azure services
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
| AZURE_USERNAME | Username for Azure services; used together with AZURE_PASSWORD to obtain an Azure AD token via the basic username/password workflow
| AZURE_PASSWORD | Password for Azure services; used together with AZURE_USERNAME to obtain an Azure AD token via the basic username/password workflow
| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
| AZURE_STORAGE_ACCOUNT_KEY | The Azure Storage Account Key to use for Authentication to Azure Blob Storage logging
@ -331,7 +334,7 @@ router_settings:
| AZURE_STORAGE_TENANT_ID | The Application Tenant ID to use for Authentication to Azure Blob Storage logging
| AZURE_STORAGE_CLIENT_ID | The Application Client ID to use for Authentication to Azure Blob Storage logging
| AZURE_STORAGE_CLIENT_SECRET | The Application Client Secret to use for Authentication to Azure Blob Storage logging
| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
| BRAINTRUST_API_KEY | API key for Braintrust integration
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
@ -433,6 +436,7 @@ router_settings:
| LITERAL_BATCH_SIZE | Batch size for Literal operations
| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
| LITELLM_MODIFY_PARAMS | Parameters to modify in LiteLLM requests
| LITELLM_EMAIL | Email associated with LiteLLM account
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
@ -446,6 +450,8 @@ router_settings:
| LITELLM_TOKEN | Access token for LiteLLM integration
| LITELLM_PRINT_STANDARD_LOGGING_PAYLOAD | If true, prints the standard logging payload to the console - useful for debugging
| LOGFIRE_TOKEN | Token for Logfire logging service
| MISTRAL_API_BASE | Base URL for Mistral API
| MISTRAL_API_KEY | API key for Mistral API
| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
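
A minimal sketch of how the environment variables added in this hunk are consumed, based on other changes in this commit (LITELLM_MODIFY_PARAMS is read at import time in litellm/__init__.py, and the AZURE_USERNAME/AZURE_PASSWORD values are read as fallbacks in the Azure auth changes later in this diff); the values shown are placeholders:

import os

# Read via bool(os.getenv("LITELLM_MODIFY_PARAMS", False)) at import time,
# so any non-empty string enables it.
os.environ["LITELLM_MODIFY_PARAMS"] = "True"

# Fallbacks for the Azure AD username/password token workflow (placeholders).
os.environ["AZURE_USERNAME"] = "user@example.com"
os.environ["AZURE_PASSWORD"] = "example-password"

import litellm  # imported after the env vars are set, so the flags are picked up
assert litellm.modify_params is True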


@ -188,7 +188,7 @@ const sidebars = {
"providers/azure_ai",
"providers/aiml",
"providers/vertex",
{
type: "category",
label: "Google AI Studio",
@ -340,7 +340,7 @@ const sidebars = {
},
"rerank",
"assistants",
{
type: "category",
label: "/files",
@ -407,9 +407,9 @@ const sidebars = {
type: "category",
label: "Logging & Observability",
items: [
"observability/langfuse_integration",
"observability/lunary_integration",
"observability/mlflow",
"observability/langfuse_integration",
"observability/gcs_bucket_integration",
"observability/langsmith_integration",
"observability/literalai_integration",


@ -113,6 +113,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"pagerduty",
"humanloop",
"gcs_pubsub",
"anthropic_cache_control_hook",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
_known_custom_logger_compatible_callbacks: List = list(
@ -127,19 +128,19 @@ prometheus_initialize_budget_metrics: Optional[bool] = False
require_auth_for_metrics_endpoint: Optional[bool] = False
argilla_batch_size: Optional[int] = None
datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
gcs_pub_sub_use_v1: Optional[bool] = (
False # if you want to use v1 gcs pubsub logged payload
)
gcs_pub_sub_use_v1: Optional[
bool
] = False # if you want to use v1 gcs pubsub logged payload
argilla_transformation_object: Optional[Dict[str, Any]] = None
_async_input_callback: List[Union[str, Callable, CustomLogger]] = (
[]
) # internal variable - async custom callbacks are routed here.
_async_success_callback: List[Union[str, Callable, CustomLogger]] = (
[]
) # internal variable - async custom callbacks are routed here.
_async_failure_callback: List[Union[str, Callable, CustomLogger]] = (
[]
) # internal variable - async custom callbacks are routed here.
_async_input_callback: List[
Union[str, Callable, CustomLogger]
] = [] # internal variable - async custom callbacks are routed here.
_async_success_callback: List[
Union[str, Callable, CustomLogger]
] = [] # internal variable - async custom callbacks are routed here.
_async_failure_callback: List[
Union[str, Callable, CustomLogger]
] = [] # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
@ -147,22 +148,22 @@ log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False
filter_invalid_headers: Optional[bool] = False
add_user_information_to_llm_headers: Optional[bool] = (
None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
)
add_user_information_to_llm_headers: Optional[
bool
] = None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
store_audit_logs = False # Enterprise feature, allow users to see audit logs
### end of callbacks #############
email: Optional[str] = (
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
token: Optional[str] = (
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
email: Optional[
str
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
token: Optional[
str
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
telemetry = True
max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
modify_params = False
modify_params = bool(os.getenv("LITELLM_MODIFY_PARAMS", False))
retry = True
### AUTH ###
api_key: Optional[str] = None
@ -234,24 +235,20 @@ enable_loadbalancing_on_batch_endpoints: Optional[bool] = None
enable_caching_on_provider_specific_optional_params: bool = (
False # feature-flag for caching on optional params - e.g. 'top_k'
)
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
caching_with_models: bool = (
False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
cache: Optional[Cache] = (
None # cache object <- use this - https://docs.litellm.ai/docs/caching
)
caching: bool = False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
caching_with_models: bool = False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
cache: Optional[
Cache
] = None # cache object <- use this - https://docs.litellm.ai/docs/caching
default_in_memory_ttl: Optional[float] = None
default_redis_ttl: Optional[float] = None
default_redis_batch_cache_expiry: Optional[float] = None
model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[str] = (
None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
)
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
default_soft_budget: float = (
DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0
)
@ -260,15 +257,11 @@ forward_traceparent_to_llm_provider: bool = False
_current_cost = 0.0 # private variable, used if max budget is set
error_logs: Dict = {}
add_function_to_prompt: bool = (
False # if function calling not supported by api, append function call details to system prompt
)
add_function_to_prompt: bool = False # if function calling not supported by api, append function call details to system prompt
client_session: Optional[httpx.Client] = None
aclient_session: Optional[httpx.AsyncClient] = None
model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks'
model_cost_map_url: str = (
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
)
model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
suppress_debug_info = False
dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None
@ -291,9 +284,7 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
force_ipv4: bool = False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
module_level_aclient = AsyncHTTPHandler(
timeout=request_timeout, client_alias="module level aclient"
)
@ -307,13 +298,13 @@ fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 3
num_retries_per_request: Optional[int] = (
None # for the request overall (incl. fallbacks + model retries)
)
num_retries_per_request: Optional[
int
] = None # for the request overall (incl. fallbacks + model retries)
####### SECRET MANAGERS #####################
secret_manager_client: Optional[Any] = (
None # list of instantiated key management clients - e.g. azure kv, infisical, etc.
)
secret_manager_client: Optional[
Any
] = None # list of instantiated key management clients - e.g. azure kv, infisical, etc.
_google_kms_resource_name: Optional[str] = None
_key_management_system: Optional[KeyManagementSystem] = None
_key_management_settings: KeyManagementSettings = KeyManagementSettings()
@ -324,6 +315,7 @@ from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
model_cost = get_model_cost_map(url=model_cost_map_url)
custom_prompt_dict: Dict[str, dict] = {}
check_provider_endpoint = False
####### THREAD-SPECIFIC DATA ####################
@ -1063,10 +1055,10 @@ from .types.llms.custom_llm import CustomLLMItem
from .types.utils import GenericStreamingChunk
custom_provider_map: List[CustomLLMItem] = []
_custom_providers: List[str] = (
[]
) # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[bool] = (
None # disable huggingface tokenizer download. Defaults to openai clk100
)
_custom_providers: List[
str
] = [] # internal helper util, used to track names of custom providers
disable_hf_tokenizer_download: Optional[
bool
] = None # disable huggingface tokenizer download. Defaults to openai clk100
global_disable_no_log_param: bool = False
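
The inline comments in this hunk document the proxy budget knobs; a short hedged illustration of the values they describe (the numbers are arbitrary):

import litellm

litellm.max_budget = 100.0       # max spend across all providers
litellm.budget_duration = "30d"  # proxy only: budget resets after 30 days ("30s", "30m", "30h" also valid)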


@ -0,0 +1,150 @@
"""
This hook is used to inject cache control directives into the messages of a chat completion.
Users can define
- `cache_control_injection_points` in the completion params and litellm will inject the cache control directives into the messages at the specified injection points.
"""
import copy
from typing import Dict, List, Optional, Tuple, Union, cast
from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.custom_prompt_management import CustomPromptManagement
from litellm.types.integrations.anthropic_cache_control_hook import (
CacheControlInjectionPoint,
CacheControlMessageInjectionPoint,
)
from litellm.types.llms.openai import AllMessageValues, ChatCompletionCachedContent
from litellm.types.utils import StandardCallbackDynamicParams
class AnthropicCacheControlHook(CustomPromptManagement):
def get_chat_completion_prompt(
self,
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:
"""
Apply cache control directives based on specified injection points.
Returns:
- model: str - the model to use
- messages: List[AllMessageValues] - messages with applied cache controls
- non_default_params: dict - params with any global cache controls
"""
# Extract cache control injection points
injection_points: List[CacheControlInjectionPoint] = non_default_params.pop(
"cache_control_injection_points", []
)
if not injection_points:
return model, messages, non_default_params
# Create a deep copy of messages to avoid modifying the original list
processed_messages = copy.deepcopy(messages)
# Process message-level cache controls
for point in injection_points:
if point.get("location") == "message":
point = cast(CacheControlMessageInjectionPoint, point)
processed_messages = self._process_message_injection(
point=point, messages=processed_messages
)
return model, processed_messages, non_default_params
@staticmethod
def _process_message_injection(
point: CacheControlMessageInjectionPoint, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
"""Process message-level cache control injection."""
control: ChatCompletionCachedContent = point.get(
"control", None
) or ChatCompletionCachedContent(type="ephemeral")
_targetted_index: Optional[Union[int, str]] = point.get("index", None)
targetted_index: Optional[int] = None
if isinstance(_targetted_index, str):
if _targetted_index.isdigit():
targetted_index = int(_targetted_index)
else:
targetted_index = _targetted_index
targetted_role = point.get("role", None)
# Case 1: Target by specific index
if targetted_index is not None:
if 0 <= targetted_index < len(messages):
messages[targetted_index] = (
AnthropicCacheControlHook._safe_insert_cache_control_in_message(
messages[targetted_index], control
)
)
# Case 2: Target by role
elif targetted_role is not None:
for msg in messages:
if msg.get("role") == targetted_role:
msg = (
AnthropicCacheControlHook._safe_insert_cache_control_in_message(
message=msg, control=control
)
)
return messages
@staticmethod
def _safe_insert_cache_control_in_message(
message: AllMessageValues, control: ChatCompletionCachedContent
) -> AllMessageValues:
"""
Safe way to insert cache control in a message
OpenAI Message content can be either:
- string
- list of objects
This method handles inserting cache control in both cases.
"""
message_content = message.get("content", None)
# 1. if string, insert cache control in the message
if isinstance(message_content, str):
message["cache_control"] = control # type: ignore
# 2. list of objects
elif isinstance(message_content, list):
for content_item in message_content:
if isinstance(content_item, dict):
content_item["cache_control"] = control # type: ignore
return message
@property
def integration_name(self) -> str:
"""Return the integration name for this hook."""
return "anthropic_cache_control_hook"
@staticmethod
def should_use_anthropic_cache_control_hook(non_default_params: Dict) -> bool:
if non_default_params.get("cache_control_injection_points", None):
return True
return False
@staticmethod
def get_custom_logger_for_anthropic_cache_control_hook(
non_default_params: Dict,
) -> Optional[CustomLogger]:
from litellm.litellm_core_utils.litellm_logging import (
_init_custom_logger_compatible_class,
)
if AnthropicCacheControlHook.should_use_anthropic_cache_control_hook(
non_default_params
):
return _init_custom_logger_compatible_class(
logging_integration="anthropic_cache_control_hook",
internal_usage_cache=None,
llm_router=None,
)
return None
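
The module docstring above describes the `cache_control_injection_points` completion param that this new hook consumes. A hedged usage sketch; the model name and message contents are placeholders:

import litellm

response = litellm.completion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        {"role": "system", "content": "You are a helpful assistant with a very long system prompt."},
        {"role": "user", "content": "Hello!"},
    ],
    # Handled by AnthropicCacheControlHook: marks every system message with
    # {"cache_control": {"type": "ephemeral"}} before the request is sent.
    cache_control_injection_points=[
        {"location": "message", "role": "system"},
    ],
)

An injection point may target messages by "role" (as above) or by "index", and may carry an explicit "control" object; when "control" is omitted, `_process_message_injection` defaults to {"type": "ephemeral"}.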


@ -94,7 +94,7 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:


@ -15,7 +15,7 @@ class CustomPromptManagement(CustomLogger, PromptManagementBase):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:


@ -152,14 +152,21 @@ class HumanloopLogger(CustomLogger):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict,]:
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
humanloop_api_key = dynamic_callback_params.get(
"humanloop_api_key"
) or get_secret_str("HUMANLOOP_API_KEY")
if prompt_id is None:
raise ValueError("prompt_id is required for Humanloop integration")
if humanloop_api_key is None:
return super().get_chat_completion_prompt(
model=model,


@ -169,10 +169,14 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict,]:
) -> Tuple[
str,
List[AllMessageValues],
dict,
]:
return self.get_chat_completion_prompt(
model,
messages,


@ -79,10 +79,12 @@ class PromptManagementBase(ABC):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict,]:
) -> Tuple[str, List[AllMessageValues], dict]:
if prompt_id is None:
raise ValueError("prompt_id is required for Prompt Management Base class")
if not self.should_run_prompt_management(
prompt_id=prompt_id, dynamic_callback_params=dynamic_callback_params
):


@ -36,6 +36,7 @@ from litellm.cost_calculator import (
RealtimeAPITokenUsageProcessor,
_select_model_name_for_cost_calc,
)
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
@ -249,9 +250,9 @@ class Logging(LiteLLMLoggingBaseClass):
self.litellm_trace_id = litellm_trace_id
self.function_id = function_id
self.streaming_chunks: List[Any] = [] # for generating complete stream response
self.sync_streaming_chunks: List[
Any
] = [] # for generating complete stream response
self.sync_streaming_chunks: List[Any] = (
[]
) # for generating complete stream response
self.log_raw_request_response = log_raw_request_response
# Initialize dynamic callbacks
@ -455,19 +456,38 @@ class Logging(LiteLLMLoggingBaseClass):
if "custom_llm_provider" in self.model_call_details:
self.custom_llm_provider = self.model_call_details["custom_llm_provider"]
def should_run_prompt_management_hooks(
self,
non_default_params: Dict,
prompt_id: Optional[str] = None,
) -> bool:
"""
Return True if prompt management hooks should be run
"""
if prompt_id:
return True
if AnthropicCacheControlHook.should_use_anthropic_cache_control_hook(
non_default_params
):
return True
return False
def get_chat_completion_prompt(
self,
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
non_default_params: Dict,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
prompt_management_logger: Optional[CustomLogger] = None,
) -> Tuple[str, List[AllMessageValues], dict]:
custom_logger = (
prompt_management_logger
or self.get_custom_logger_for_prompt_management(model)
or self.get_custom_logger_for_prompt_management(
model=model, non_default_params=non_default_params
)
)
if custom_logger:
(
model,
@ -476,7 +496,7 @@ class Logging(LiteLLMLoggingBaseClass):
) = custom_logger.get_chat_completion_prompt(
model=model,
messages=messages,
non_default_params=non_default_params,
non_default_params=non_default_params or {},
prompt_id=prompt_id,
prompt_variables=prompt_variables,
dynamic_callback_params=self.standard_callback_dynamic_params,
@ -485,7 +505,7 @@ class Logging(LiteLLMLoggingBaseClass):
return model, messages, non_default_params
def get_custom_logger_for_prompt_management(
self, model: str
self, model: str, non_default_params: Dict
) -> Optional[CustomLogger]:
"""
Get a custom logger for prompt management based on model name or available callbacks.
@ -520,6 +540,26 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["prompt_integration"] = logger.__class__.__name__
return logger
if anthropic_cache_control_logger := AnthropicCacheControlHook.get_custom_logger_for_anthropic_cache_control_hook(
non_default_params
):
self.model_call_details["prompt_integration"] = (
anthropic_cache_control_logger.__class__.__name__
)
return anthropic_cache_control_logger
return None
def get_custom_logger_for_anthropic_cache_control_hook(
self, non_default_params: Dict
) -> Optional[CustomLogger]:
if non_default_params.get("cache_control_injection_points", None):
custom_logger = _init_custom_logger_compatible_class(
logging_integration="anthropic_cache_control_hook",
internal_usage_cache=None,
llm_router=None,
)
return custom_logger
return None
def _get_raw_request_body(self, data: Optional[Union[dict, str]]) -> dict:
@ -557,9 +597,9 @@ class Logging(LiteLLMLoggingBaseClass):
model
): # if model name was changes pre-call, overwrite the initial model call name with the new one
self.model_call_details["model"] = model
self.model_call_details["litellm_params"][
"api_base"
] = self._get_masked_api_base(additional_args.get("api_base", ""))
self.model_call_details["litellm_params"]["api_base"] = (
self._get_masked_api_base(additional_args.get("api_base", ""))
)
def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915
# Log the exact input to the LLM API
@ -588,10 +628,10 @@ class Logging(LiteLLMLoggingBaseClass):
try:
# [Non-blocking Extra Debug Information in metadata]
if turn_off_message_logging is True:
_metadata[
"raw_request"
] = "redacted by litellm. \
_metadata["raw_request"] = (
"redacted by litellm. \
'litellm.turn_off_message_logging=True'"
)
else:
curl_command = self._get_request_curl_command(
api_base=additional_args.get("api_base", ""),
@ -602,32 +642,32 @@ class Logging(LiteLLMLoggingBaseClass):
_metadata["raw_request"] = str(curl_command)
# split up, so it's easier to parse in the UI
self.model_call_details[
"raw_request_typed_dict"
] = RawRequestTypedDict(
raw_request_api_base=str(
additional_args.get("api_base") or ""
),
raw_request_body=self._get_raw_request_body(
additional_args.get("complete_input_dict", {})
),
raw_request_headers=self._get_masked_headers(
additional_args.get("headers", {}) or {},
ignore_sensitive_headers=True,
),
error=None,
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
raw_request_api_base=str(
additional_args.get("api_base") or ""
),
raw_request_body=self._get_raw_request_body(
additional_args.get("complete_input_dict", {})
),
raw_request_headers=self._get_masked_headers(
additional_args.get("headers", {}) or {},
ignore_sensitive_headers=True,
),
error=None,
)
)
except Exception as e:
self.model_call_details[
"raw_request_typed_dict"
] = RawRequestTypedDict(
error=str(e),
self.model_call_details["raw_request_typed_dict"] = (
RawRequestTypedDict(
error=str(e),
)
)
_metadata[
"raw_request"
] = "Unable to Log \
_metadata["raw_request"] = (
"Unable to Log \
raw request: {}".format(
str(e)
str(e)
)
)
if self.logger_fn and callable(self.logger_fn):
try:
@ -957,9 +997,9 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
f"response_cost_failure_debug_information: {debug_info}"
)
self.model_call_details[
"response_cost_failure_debug_information"
] = debug_info
self.model_call_details["response_cost_failure_debug_information"] = (
debug_info
)
return None
try:
@ -984,9 +1024,9 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
f"response_cost_failure_debug_information: {debug_info}"
)
self.model_call_details[
"response_cost_failure_debug_information"
] = debug_info
self.model_call_details["response_cost_failure_debug_information"] = (
debug_info
)
return None
@ -1046,9 +1086,9 @@ class Logging(LiteLLMLoggingBaseClass):
end_time = datetime.datetime.now()
if self.completion_start_time is None:
self.completion_start_time = end_time
self.model_call_details[
"completion_start_time"
] = self.completion_start_time
self.model_call_details["completion_start_time"] = (
self.completion_start_time
)
self.model_call_details["log_event_type"] = "successful_api_call"
self.model_call_details["end_time"] = end_time
self.model_call_details["cache_hit"] = cache_hit
@ -1127,39 +1167,39 @@ class Logging(LiteLLMLoggingBaseClass):
"response_cost"
]
else:
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(result=logging_result)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=logging_result)
)
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=logging_result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=logging_result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif isinstance(result, dict) or isinstance(result, list):
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=result,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
elif standard_logging_object is not None:
self.model_call_details[
"standard_logging_object"
] = standard_logging_object
self.model_call_details["standard_logging_object"] = (
standard_logging_object
)
else: # streaming chunks + image gen.
self.model_call_details["response_cost"] = None
@ -1215,23 +1255,23 @@ class Logging(LiteLLMLoggingBaseClass):
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
)
self.model_call_details[
"complete_streaming_response"
] = complete_streaming_response
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(result=complete_streaming_response)
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_success_callbacks,
@ -1580,10 +1620,10 @@ class Logging(LiteLLMLoggingBaseClass):
)
else:
if self.stream and complete_streaming_response:
self.model_call_details[
"complete_response"
] = self.model_call_details.get(
"complete_streaming_response", {}
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
openMeterLogger.log_success_event(
@ -1623,10 +1663,10 @@ class Logging(LiteLLMLoggingBaseClass):
)
else:
if self.stream and complete_streaming_response:
self.model_call_details[
"complete_response"
] = self.model_call_details.get(
"complete_streaming_response", {}
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
@ -1733,9 +1773,9 @@ class Logging(LiteLLMLoggingBaseClass):
if complete_streaming_response is not None:
print_verbose("Async success callbacks: Got a complete streaming response")
self.model_call_details[
"async_complete_streaming_response"
] = complete_streaming_response
self.model_call_details["async_complete_streaming_response"] = (
complete_streaming_response
)
try:
if self.model_call_details.get("cache_hit", False) is True:
self.model_call_details["response_cost"] = 0.0
@ -1745,10 +1785,10 @@ class Logging(LiteLLMLoggingBaseClass):
model_call_details=self.model_call_details
)
# base_model defaults to None if not set on model_info
self.model_call_details[
"response_cost"
] = self._response_cost_calculator(
result=complete_streaming_response
self.model_call_details["response_cost"] = (
self._response_cost_calculator(
result=complete_streaming_response
)
)
verbose_logger.debug(
@ -1761,16 +1801,16 @@ class Logging(LiteLLMLoggingBaseClass):
self.model_call_details["response_cost"] = None
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj=complete_streaming_response,
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="success",
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
callbacks = self.get_combined_callback_list(
dynamic_success_callbacks=self.dynamic_async_success_callbacks,
@ -1976,18 +2016,18 @@ class Logging(LiteLLMLoggingBaseClass):
## STANDARDIZED LOGGING PAYLOAD
self.model_call_details[
"standard_logging_object"
] = get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj={},
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="failure",
error_str=str(exception),
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
self.model_call_details["standard_logging_object"] = (
get_standard_logging_object_payload(
kwargs=self.model_call_details,
init_response_obj={},
start_time=start_time,
end_time=end_time,
logging_obj=self,
status="failure",
error_str=str(exception),
original_exception=exception,
standard_built_in_tools_params=self.standard_built_in_tools_params,
)
)
return start_time, end_time
@ -2753,9 +2793,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
endpoint=arize_config.endpoint,
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={arize_config.space_key},api_key={arize_config.api_key}"
)
for callback in _in_memory_loggers:
if (
isinstance(callback, ArizeLogger)
@ -2779,9 +2819,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
# auth can be disabled on local deployments of arize phoenix
if arize_phoenix_config.otlp_auth_headers is not None:
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = arize_phoenix_config.otlp_auth_headers
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
arize_phoenix_config.otlp_auth_headers
)
for callback in _in_memory_loggers:
if (
@ -2872,9 +2912,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
exporter="otlp_http",
endpoint="https://langtrace.ai/api/trace",
)
os.environ[
"OTEL_EXPORTER_OTLP_TRACES_HEADERS"
] = f"api_key={os.getenv('LANGTRACE_API_KEY')}"
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"api_key={os.getenv('LANGTRACE_API_KEY')}"
)
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
@ -2908,6 +2948,13 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
pagerduty_logger = PagerDutyAlerting(**custom_logger_init_args)
_in_memory_loggers.append(pagerduty_logger)
return pagerduty_logger # type: ignore
elif logging_integration == "anthropic_cache_control_hook":
for callback in _in_memory_loggers:
if isinstance(callback, AnthropicCacheControlHook):
return callback
anthropic_cache_control_hook = AnthropicCacheControlHook()
_in_memory_loggers.append(anthropic_cache_control_hook)
return anthropic_cache_control_hook # type: ignore
elif logging_integration == "gcs_pubsub":
for callback in _in_memory_loggers:
if isinstance(callback, GcsPubSubLogger):
@ -3046,6 +3093,10 @@ def get_custom_logger_compatible_class( # noqa: PLR0915
for callback in _in_memory_loggers:
if isinstance(callback, PagerDutyAlerting):
return callback
elif logging_integration == "anthropic_cache_control_hook":
for callback in _in_memory_loggers:
if isinstance(callback, AnthropicCacheControlHook):
return callback
elif logging_integration == "gcs_pubsub":
for callback in _in_memory_loggers:
if isinstance(callback, GcsPubSubLogger):
@ -3369,10 +3420,10 @@ class StandardLoggingPayloadSetup:
for key in StandardLoggingHiddenParams.__annotations__.keys():
if key in hidden_params:
if key == "additional_headers":
clean_hidden_params[
"additional_headers"
] = StandardLoggingPayloadSetup.get_additional_headers(
hidden_params[key]
clean_hidden_params["additional_headers"] = (
StandardLoggingPayloadSetup.get_additional_headers(
hidden_params[key]
)
)
else:
clean_hidden_params[key] = hidden_params[key] # type: ignore
@ -3651,7 +3702,7 @@ def emit_standard_logging_payload(payload: StandardLoggingPayload):
def get_standard_logging_metadata(
metadata: Optional[Dict[str, Any]]
metadata: Optional[Dict[str, Any]],
) -> StandardLoggingMetadata:
"""
Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
@ -3715,9 +3766,9 @@ def scrub_sensitive_keys_in_metadata(litellm_params: Optional[dict]):
):
for k, v in metadata["user_api_key_metadata"].items():
if k == "logging": # prevent logging user logging keys
cleaned_user_api_key_metadata[
k
] = "scrubbed_by_litellm_for_sensitive_keys"
cleaned_user_api_key_metadata[k] = (
"scrubbed_by_litellm_for_sensitive_keys"
)
else:
cleaned_user_api_key_metadata[k] = v


@ -44,7 +44,7 @@ from litellm.utils import (
token_counter,
)
from ..common_utils import AnthropicError, process_anthropic_headers
from ..common_utils import AnthropicError, AnthropicModelInfo, process_anthropic_headers
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
@ -54,7 +54,7 @@ else:
LoggingClass = Any
class AnthropicConfig(BaseConfig):
class AnthropicConfig(AnthropicModelInfo, BaseConfig):
"""
Reference: https://docs.anthropic.com/claude/reference/messages_post
@ -127,41 +127,6 @@ class AnthropicConfig(BaseConfig):
"anthropic-beta": "prompt-caching-2024-07-31",
}
def get_anthropic_headers(
self,
api_key: str,
anthropic_version: Optional[str] = None,
computer_tool_used: bool = False,
prompt_caching_set: bool = False,
pdf_used: bool = False,
is_vertex_request: bool = False,
user_anthropic_beta_headers: Optional[List[str]] = None,
) -> dict:
betas = set()
if prompt_caching_set:
betas.add("prompt-caching-2024-07-31")
if computer_tool_used:
betas.add("computer-use-2024-10-22")
if pdf_used:
betas.add("pdfs-2024-09-25")
headers = {
"anthropic-version": anthropic_version or "2023-06-01",
"x-api-key": api_key,
"accept": "application/json",
"content-type": "application/json",
}
if user_anthropic_beta_headers is not None:
betas.update(user_anthropic_beta_headers)
# Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
if is_vertex_request is True:
pass
elif len(betas) > 0:
headers["anthropic-beta"] = ",".join(betas)
return headers
def _map_tool_choice(
self, tool_choice: Optional[str], parallel_tool_use: Optional[bool]
) -> Optional[AnthropicMessagesToolChoice]:
@ -446,49 +411,6 @@ class AnthropicConfig(BaseConfig):
)
return _tool
def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool:
"""
Return if {"cache_control": ..} in message content block
Used to check if anthropic prompt caching headers need to be set.
"""
for message in messages:
if message.get("cache_control", None) is not None:
return True
_message_content = message.get("content")
if _message_content is not None and isinstance(_message_content, list):
for content in _message_content:
if "cache_control" in content:
return True
return False
def is_computer_tool_used(
self, tools: Optional[List[AllAnthropicToolsValues]]
) -> bool:
if tools is None:
return False
for tool in tools:
if "type" in tool and tool["type"].startswith("computer_"):
return True
return False
def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
"""
Set to true if media passed into messages.
"""
for message in messages:
if (
"content" in message
and message["content"] is not None
and isinstance(message["content"], list)
):
for content in message["content"]:
if "type" in content and content["type"] != "text":
return True
return False
def translate_system_message(
self, messages: List[AllMessageValues]
) -> List[AnthropicSystemMessageContent]:
@ -862,47 +784,3 @@ class AnthropicConfig(BaseConfig):
message=error_message,
headers=cast(httpx.Headers, headers),
)
def _get_user_anthropic_beta_headers(
self, anthropic_beta_header: Optional[str]
) -> Optional[List[str]]:
if anthropic_beta_header is None:
return None
return anthropic_beta_header.split(",")
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> Dict:
if api_key is None:
raise litellm.AuthenticationError(
message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars",
llm_provider="anthropic",
model=model,
)
tools = optional_params.get("tools")
prompt_caching_set = self.is_cache_control_set(messages=messages)
computer_tool_used = self.is_computer_tool_used(tools=tools)
pdf_used = self.is_pdf_used(messages=messages)
user_anthropic_beta_headers = self._get_user_anthropic_beta_headers(
anthropic_beta_header=headers.get("anthropic-beta")
)
anthropic_headers = self.get_anthropic_headers(
computer_tool_used=computer_tool_used,
prompt_caching_set=prompt_caching_set,
pdf_used=pdf_used,
api_key=api_key,
is_vertex_request=optional_params.get("is_vertex_request", False),
user_anthropic_beta_headers=user_anthropic_beta_headers,
)
headers = {**headers, **anthropic_headers}
return headers
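
A hedged sketch of the beta-header assembly that this hunk removes from AnthropicConfig and the following anthropic common-utils hunk re-homes in AnthropicModelInfo; AnthropicConfig still exposes it through inheritance, and the API key below is a placeholder:

import litellm

headers = litellm.AnthropicConfig().get_anthropic_headers(
    api_key="sk-ant-placeholder",
    prompt_caching_set=True,
    computer_tool_used=True,
)
# headers["anthropic-beta"] contains "prompt-caching-2024-07-31" and
# "computer-use-2024-10-22" (join order is unspecified because betas is a set);
# no beta header is attached when is_vertex_request=True.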


@ -2,7 +2,7 @@
This file contains common utils for anthropic calls.
"""
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union
import httpx
@ -10,6 +10,8 @@ import litellm
from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.anthropic import AllAnthropicToolsValues
from litellm.types.llms.openai import AllMessageValues
class AnthropicError(BaseLLMException):
@ -23,6 +25,128 @@ class AnthropicError(BaseLLMException):
class AnthropicModelInfo(BaseLLMModelInfo):
def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool:
"""
Return if {"cache_control": ..} in message content block
Used to check if anthropic prompt caching headers need to be set.
"""
for message in messages:
if message.get("cache_control", None) is not None:
return True
_message_content = message.get("content")
if _message_content is not None and isinstance(_message_content, list):
for content in _message_content:
if "cache_control" in content:
return True
return False
def is_computer_tool_used(
self, tools: Optional[List[AllAnthropicToolsValues]]
) -> bool:
if tools is None:
return False
for tool in tools:
if "type" in tool and tool["type"].startswith("computer_"):
return True
return False
def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
"""
Set to true if media passed into messages.
"""
for message in messages:
if (
"content" in message
and message["content"] is not None
and isinstance(message["content"], list)
):
for content in message["content"]:
if "type" in content and content["type"] != "text":
return True
return False
def _get_user_anthropic_beta_headers(
self, anthropic_beta_header: Optional[str]
) -> Optional[List[str]]:
if anthropic_beta_header is None:
return None
return anthropic_beta_header.split(",")
def get_anthropic_headers(
self,
api_key: str,
anthropic_version: Optional[str] = None,
computer_tool_used: bool = False,
prompt_caching_set: bool = False,
pdf_used: bool = False,
is_vertex_request: bool = False,
user_anthropic_beta_headers: Optional[List[str]] = None,
) -> dict:
betas = set()
if prompt_caching_set:
betas.add("prompt-caching-2024-07-31")
if computer_tool_used:
betas.add("computer-use-2024-10-22")
if pdf_used:
betas.add("pdfs-2024-09-25")
headers = {
"anthropic-version": anthropic_version or "2023-06-01",
"x-api-key": api_key,
"accept": "application/json",
"content-type": "application/json",
}
if user_anthropic_beta_headers is not None:
betas.update(user_anthropic_beta_headers)
# Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
if is_vertex_request is True:
pass
elif len(betas) > 0:
headers["anthropic-beta"] = ",".join(betas)
return headers
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> Dict:
if api_key is None:
raise litellm.AuthenticationError(
message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars",
llm_provider="anthropic",
model=model,
)
tools = optional_params.get("tools")
prompt_caching_set = self.is_cache_control_set(messages=messages)
computer_tool_used = self.is_computer_tool_used(tools=tools)
pdf_used = self.is_pdf_used(messages=messages)
user_anthropic_beta_headers = self._get_user_anthropic_beta_headers(
anthropic_beta_header=headers.get("anthropic-beta")
)
anthropic_headers = self.get_anthropic_headers(
computer_tool_used=computer_tool_used,
prompt_caching_set=prompt_caching_set,
pdf_used=pdf_used,
api_key=api_key,
is_vertex_request=optional_params.get("is_vertex_request", False),
user_anthropic_beta_headers=user_anthropic_beta_headers,
)
headers = {**headers, **anthropic_headers}
return headers
@staticmethod
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
return (


@ -125,14 +125,22 @@ class AzureOpenAIConfig(BaseConfig):
) -> bool:
"""
- check if api_version is supported for response_format
- returns True if the API version is equal to or newer than the supported version
"""
api_year = int(api_version_year)
api_month = int(api_version_month)
supported_year = int(API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT)
supported_month = int(API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT)
is_supported = (
int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
)
return is_supported
# If the year is greater than supported year, it's definitely supported
if api_year > supported_year:
return True
# If the year is less than supported year, it's not supported
elif api_year < supported_year:
return False
# If same year, check if month is >= supported month
else:
return api_month >= supported_month
def map_openai_params(
self,
@ -202,6 +210,7 @@ class AzureOpenAIConfig(BaseConfig):
is_response_format_supported_api_version
and _is_response_format_supported_model
)
optional_params = self._add_response_format_to_tools(
optional_params=optional_params,
value=value,
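
For the api_version support check rewritten above, a small worked sketch of the year-then-month comparison; the real supported-version constants are not shown in this hunk, so 2024/08 below is an assumed example, not the actual value:

def is_supported(api_year: int, api_month: int,
                 supported_year: int = 2024, supported_month: int = 8) -> bool:
    # Later year: always supported; earlier year: never; same year: compare months.
    if api_year != supported_year:
        return api_year > supported_year
    return api_month >= supported_month

assert is_supported(2025, 1)      # newer year, earlier month -> supported
assert not is_supported(2024, 6)  # same year, earlier month -> not supported
assert is_supported(2024, 8)      # exactly the assumed supported version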


@ -309,21 +309,30 @@ class BaseAzureLLM(BaseOpenAILLM):
azure_ad_token_provider: Optional[Callable[[], str]] = None
# If we have api_key, then we have higher priority
azure_ad_token = litellm_params.get("azure_ad_token")
tenant_id = litellm_params.get("tenant_id")
client_id = litellm_params.get("client_id")
client_secret = litellm_params.get("client_secret")
azure_username = litellm_params.get("azure_username")
azure_password = litellm_params.get("azure_password")
tenant_id = litellm_params.get("tenant_id", os.getenv("AZURE_TENANT_ID"))
client_id = litellm_params.get("client_id", os.getenv("AZURE_CLIENT_ID"))
client_secret = litellm_params.get(
"client_secret", os.getenv("AZURE_CLIENT_SECRET")
)
azure_username = litellm_params.get(
"azure_username", os.getenv("AZURE_USERNAME")
)
azure_password = litellm_params.get(
"azure_password", os.getenv("AZURE_PASSWORD")
)
max_retries = litellm_params.get("max_retries")
timeout = litellm_params.get("timeout")
if not api_key and tenant_id and client_id and client_secret:
verbose_logger.debug("Using Azure AD Token Provider for Azure Auth")
verbose_logger.debug(
"Using Azure AD Token Provider from Entrata ID for Azure Auth"
)
azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
tenant_id=tenant_id,
client_id=client_id,
client_secret=client_secret,
)
if azure_username and azure_password and client_id:
verbose_logger.debug("Using Azure Username and Password for Azure Auth")
azure_ad_token_provider = get_azure_ad_token_from_username_password(
azure_username=azure_username,
azure_password=azure_password,
@ -331,12 +340,16 @@ class BaseAzureLLM(BaseOpenAILLM):
)
if azure_ad_token is not None and azure_ad_token.startswith("oidc/"):
verbose_logger.debug("Using Azure OIDC Token for Azure Auth")
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
elif (
not api_key
and azure_ad_token_provider is None
and litellm.enable_azure_ad_token_refresh is True
):
verbose_logger.debug(
"Using Azure AD token provider based on Service Principal with Secret workflow for Azure Auth"
)
try:
azure_ad_token_provider = get_azure_ad_token_provider()
except ValueError:


@ -44,6 +44,19 @@ class BaseLLMModelInfo(ABC):
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
pass
@abstractmethod
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
pass
@staticmethod
@abstractmethod
def get_base_model(model: str) -> Optional[str]:


@ -44,7 +44,18 @@ class AmazonBedrockGlobalConfig:
)
def get_ap_regions(self) -> List[str]:
return ["ap-northeast-1", "ap-northeast-2", "ap-northeast-3", "ap-south-1"]
"""
Source: https://www.aws-services.info/bedrock.html
"""
return [
"ap-northeast-1", # Asia Pacific (Tokyo)
"ap-northeast-2", # Asia Pacific (Seoul)
"ap-northeast-3", # Asia Pacific (Osaka)
"ap-south-1", # Asia Pacific (Mumbai)
"ap-south-2", # Asia Pacific (Hyderabad)
"ap-southeast-1", # Asia Pacific (Singapore)
"ap-southeast-2", # Asia Pacific (Sydney)
]
def get_sa_regions(self) -> List[str]:
return ["sa-east-1"]
@ -54,10 +65,14 @@ class AmazonBedrockGlobalConfig:
Source: https://www.aws-services.info/bedrock.html
"""
return [
"eu-west-1",
"eu-west-2",
"eu-west-3",
"eu-central-1",
"eu-west-1", # Europe (Ireland)
"eu-west-2", # Europe (London)
"eu-west-3", # Europe (Paris)
"eu-central-1", # Europe (Frankfurt)
"eu-central-2", # Europe (Zurich)
"eu-south-1", # Europe (Milan)
"eu-south-2", # Europe (Spain)
"eu-north-1", # Europe (Stockholm)
]
def get_ca_regions(self) -> List[str]:
@ -68,11 +83,11 @@ class AmazonBedrockGlobalConfig:
Source: https://www.aws-services.info/bedrock.html
"""
return [
"us-east-2",
"us-east-1",
"us-west-1",
"us-west-2",
"us-gov-west-1",
"us-east-1", # US East (N. Virginia)
"us-east-2", # US East (Ohio)
"us-west-2", # US West (Oregon)
"us-gov-east-1", # AWS GovCloud (US-East)
"us-gov-west-1", # AWS GovCloud (US-West)
]


@ -0,0 +1,356 @@
import time
from typing import TYPE_CHECKING, Any, AsyncIterator, Iterator, List, Optional, Union
import httpx
import litellm
from litellm.litellm_core_utils.prompt_templates.factory import cohere_messages_pt_v2
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.types.llms.cohere import CohereV2ChatResponse
from litellm.types.llms.openai import AllMessageValues, ChatCompletionToolCallChunk
from litellm.types.utils import ModelResponse, Usage
from ..common_utils import CohereError
from ..common_utils import ModelResponseIterator as CohereModelResponseIterator
from ..common_utils import validate_environment as cohere_validate_environment
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
LiteLLMLoggingObj = _LiteLLMLoggingObj
else:
LiteLLMLoggingObj = Any
class CohereV2ChatConfig(BaseConfig):
"""
Configuration class for Cohere's API interface.
Args:
preamble (str, optional): When specified, the default Cohere preamble will be replaced with the provided one.
chat_history (List[Dict[str, str]], optional): A list of previous messages between the user and the model.
generation_id (str, optional): Unique identifier for the generated reply.
response_id (str, optional): Unique identifier for the response.
conversation_id (str, optional): An alternative to chat_history, creates or resumes a persisted conversation.
prompt_truncation (str, optional): Dictates how the prompt will be constructed. Options: 'AUTO', 'AUTO_PRESERVE_ORDER', 'OFF'.
connectors (List[Dict[str, str]], optional): List of connectors (e.g., web-search) to enrich the model's reply.
search_queries_only (bool, optional): When true, the response will only contain a list of generated search queries.
documents (List[Dict[str, str]], optional): A list of relevant documents that the model can cite.
temperature (float, optional): A non-negative float that tunes the degree of randomness in generation.
max_tokens (int, optional): The maximum number of tokens the model will generate as part of the response.
k (int, optional): Ensures only the top k most likely tokens are considered for generation at each step.
p (float, optional): Ensures that only the most likely tokens, with total probability mass of p, are considered for generation.
frequency_penalty (float, optional): Used to reduce repetitiveness of generated tokens.
presence_penalty (float, optional): Used to reduce repetitiveness of generated tokens.
tools (List[Dict[str, str]], optional): A list of available tools (functions) that the model may suggest invoking.
tool_results (List[Dict[str, Any]], optional): A list of results from invoking tools.
seed (int, optional): A seed to assist reproducibility of the model's response.
"""
preamble: Optional[str] = None
chat_history: Optional[list] = None
generation_id: Optional[str] = None
response_id: Optional[str] = None
conversation_id: Optional[str] = None
prompt_truncation: Optional[str] = None
connectors: Optional[list] = None
search_queries_only: Optional[bool] = None
documents: Optional[list] = None
temperature: Optional[int] = None
max_tokens: Optional[int] = None
k: Optional[int] = None
p: Optional[int] = None
frequency_penalty: Optional[int] = None
presence_penalty: Optional[int] = None
tools: Optional[list] = None
tool_results: Optional[list] = None
seed: Optional[int] = None
def __init__(
self,
preamble: Optional[str] = None,
chat_history: Optional[list] = None,
generation_id: Optional[str] = None,
response_id: Optional[str] = None,
conversation_id: Optional[str] = None,
prompt_truncation: Optional[str] = None,
connectors: Optional[list] = None,
search_queries_only: Optional[bool] = None,
documents: Optional[list] = None,
temperature: Optional[int] = None,
max_tokens: Optional[int] = None,
k: Optional[int] = None,
p: Optional[int] = None,
frequency_penalty: Optional[int] = None,
presence_penalty: Optional[int] = None,
tools: Optional[list] = None,
tool_results: Optional[list] = None,
seed: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
return cohere_validate_environment(
headers=headers,
model=model,
messages=messages,
optional_params=optional_params,
api_key=api_key,
)
def get_supported_openai_params(self, model: str) -> List[str]:
return [
"stream",
"temperature",
"max_tokens",
"top_p",
"frequency_penalty",
"presence_penalty",
"stop",
"n",
"tools",
"tool_choice",
"seed",
"extra_headers",
]
def map_openai_params(
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
for param, value in non_default_params.items():
if param == "stream":
optional_params["stream"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "max_tokens":
optional_params["max_tokens"] = value
if param == "n":
optional_params["num_generations"] = value
if param == "top_p":
optional_params["p"] = value
if param == "frequency_penalty":
optional_params["frequency_penalty"] = value
if param == "presence_penalty":
optional_params["presence_penalty"] = value
if param == "stop":
optional_params["stop_sequences"] = value
if param == "tools":
optional_params["tools"] = value
if param == "seed":
optional_params["seed"] = value
return optional_params
def transform_request(
self,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
headers: dict,
) -> dict:
## Load Config
for k, v in litellm.CohereChatConfig.get_config().items():
if (
k not in optional_params
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
most_recent_message, chat_history = cohere_messages_pt_v2(
messages=messages, model=model, llm_provider="cohere_chat"
)
## Handle Tool Calling
if "tools" in optional_params:
_is_function_call = True
cohere_tools = self._construct_cohere_tool(tools=optional_params["tools"])
optional_params["tools"] = cohere_tools
if isinstance(most_recent_message, dict):
optional_params["tool_results"] = [most_recent_message]
elif isinstance(most_recent_message, str):
optional_params["message"] = most_recent_message
## if the last chat history message is from the 'USER' and 'tool_results' are given, set force_single_step=True - else the cohere api fails
if len(chat_history) > 0 and chat_history[-1]["role"] == "USER":
optional_params["force_single_step"] = True
return optional_params
def transform_response(
self,
model: str,
raw_response: httpx.Response,
model_response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
request_data: dict,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
encoding: Any,
api_key: Optional[str] = None,
json_mode: Optional[bool] = None,
) -> ModelResponse:
try:
raw_response_json = raw_response.json()
except Exception:
raise CohereError(
message=raw_response.text, status_code=raw_response.status_code
)
try:
cohere_v2_chat_response = CohereV2ChatResponse(**raw_response_json) # type: ignore
except Exception:
raise CohereError(message=raw_response.text, status_code=422)
cohere_content = cohere_v2_chat_response["message"].get("content", None)
if cohere_content is not None:
model_response.choices[0].message.content = "".join( # type: ignore
[
content.get("text", "")
for content in cohere_content
if content is not None
]
)
## ADD CITATIONS
if "citations" in cohere_v2_chat_response:
setattr(model_response, "citations", cohere_v2_chat_response["citations"])
## Tool calling response
cohere_tools_response = cohere_v2_chat_response["message"].get("tool_calls", [])
if cohere_tools_response is not None and cohere_tools_response != []:
# convert cohere_tools_response to OpenAI response format
tool_calls: List[ChatCompletionToolCallChunk] = []
for index, tool in enumerate(cohere_tools_response):
tool_call: ChatCompletionToolCallChunk = {
**tool, # type: ignore
"index": index,
}
tool_calls.append(tool_call)
_message = litellm.Message(
tool_calls=tool_calls,
content=None,
)
model_response.choices[0].message = _message # type: ignore
## CALCULATING USAGE - use cohere `usage.tokens` for returning usage
token_usage = cohere_v2_chat_response["usage"].get("tokens", {})
prompt_tokens = token_usage.get("input_tokens", 0)
completion_tokens = token_usage.get("output_tokens", 0)
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
setattr(model_response, "usage", usage)
return model_response
def _construct_cohere_tool(
self,
tools: Optional[list] = None,
):
if tools is None:
tools = []
cohere_tools = []
for tool in tools:
cohere_tool = self._translate_openai_tool_to_cohere(tool)
cohere_tools.append(cohere_tool)
return cohere_tools
def _translate_openai_tool_to_cohere(
self,
openai_tool: dict,
):
# cohere tools look like this
"""
{
"name": "query_daily_sales_report",
"description": "Connects to a database to retrieve overall sales volumes and sales information for a given day.",
"parameter_definitions": {
"day": {
"description": "Retrieves sales data for this day, formatted as YYYY-MM-DD.",
"type": "str",
"required": True
}
}
}
"""
# OpenAI tools look like this
"""
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
"""
cohere_tool = {
"name": openai_tool["function"]["name"],
"description": openai_tool["function"]["description"],
"parameter_definitions": {},
}
for param_name, param_def in openai_tool["function"]["parameters"][
"properties"
].items():
required_params = (
openai_tool.get("function", {})
.get("parameters", {})
.get("required", [])
)
cohere_param_def = {
"description": param_def.get("description", ""),
"type": param_def.get("type", ""),
"required": param_name in required_params,
}
cohere_tool["parameter_definitions"][param_name] = cohere_param_def
return cohere_tool
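# For example, the OpenAI weather tool shown in the docstring above translates to
# roughly the following Cohere tool ("required" is derived from the OpenAI
# "required" list, and a missing description defaults to ""):
#   {
#       "name": "get_current_weather",
#       "description": "Get the current weather in a given location",
#       "parameter_definitions": {
#           "location": {"description": "The city and state, e.g. San Francisco, CA", "type": "string", "required": True},
#           "unit": {"description": "", "type": "string", "required": False},
#       },
#   }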
def get_model_response_iterator(
self,
streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
sync_stream: bool,
json_mode: Optional[bool] = False,
):
return CohereModelResponseIterator(
streaming_response=streaming_response,
sync_stream=sync_stream,
json_mode=json_mode,
)
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return CohereError(status_code=status_code, message=error_message)
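A rough usage sketch of the parameter mapping above (model name and values are illustrative; this assumes the class is the `CohereV2ChatConfig` imported elsewhere in this commit):

from litellm.llms.cohere.chat.v2_transformation import CohereV2ChatConfig

config = CohereV2ChatConfig()
mapped = config.map_openai_params(
    non_default_params={"temperature": 0.3, "top_p": 0.9, "n": 2, "stop": ["\n\n"]},
    optional_params={},
    model="command-r-plus",
    drop_params=False,
)
# mapped == {"temperature": 0.3, "p": 0.9, "num_generations": 2, "stop_sequences": ["\n\n"]}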

View file

@ -104,19 +104,28 @@ class ModelResponseIterator:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
index = str_line.find("data:")
if index != -1:
str_line = str_line[index:]
data_json = json.loads(str_line)
return self.chunk_parser(chunk=data_json)
return self.convert_str_chunk_to_generic_chunk(chunk=chunk)
except StopIteration:
raise StopIteration
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
def convert_str_chunk_to_generic_chunk(self, chunk: str) -> GenericStreamingChunk:
"""
Convert a string chunk to a GenericStreamingChunk
Note: This is used for Cohere pass-through streaming logging
"""
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
index = str_line.find("data:")
if index != -1:
str_line = str_line[index:]
data_json = json.loads(str_line)
return self.chunk_parser(chunk=data_json)
# Async iterator
def __aiter__(self):
self.async_response_iterator = self.streaming_response.__aiter__()
@ -131,15 +140,7 @@ class ModelResponseIterator:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
str_line = chunk
if isinstance(chunk, bytes): # Handle binary data
str_line = chunk.decode("utf-8") # Convert bytes to string
index = str_line.find("data:")
if index != -1:
str_line = str_line[index:]
data_json = json.loads(str_line)
return self.chunk_parser(chunk=data_json)
return self.convert_str_chunk_to_generic_chunk(chunk=chunk)
except StopAsyncIteration:
raise StopAsyncIteration
except ValueError as e:

View file

@ -389,7 +389,7 @@ class OpenAIGPTConfig(BaseLLMModelInfo, BaseConfig):
)
@staticmethod
def get_base_model(model: str) -> str:
def get_base_model(model: Optional[str] = None) -> Optional[str]:
return model
def get_model_response_iterator(

View file

@ -1,6 +1,7 @@
from typing import List, Optional
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
from ..base_llm.base_utils import BaseLLMModelInfo
from ..base_llm.chat.transformation import BaseLLMException
@ -11,6 +12,26 @@ class TopazException(BaseLLMException):
class TopazModelInfo(BaseLLMModelInfo):
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
if api_key is None:
raise ValueError(
"API key is required for Topaz image variations. Set via `TOPAZ_API_KEY` or `api_key=..`"
)
return {
# "Content-Type": "multipart/form-data",
"Accept": "image/jpeg",
"X-API-Key": api_key,
}
def get_models(
self, api_key: Optional[str] = None, api_base: Optional[str] = None
) -> List[str]:

View file

@ -10,10 +10,7 @@ from litellm.llms.base_llm.chat.transformation import (
BaseLLMException,
LiteLLMLoggingObj,
)
from litellm.types.llms.openai import (
AllMessageValues,
OpenAIImageVariationOptionalParams,
)
from litellm.types.llms.openai import OpenAIImageVariationOptionalParams
from litellm.types.utils import (
FileTypes,
HttpHandlerRequestFields,
@ -22,35 +19,15 @@ from litellm.types.utils import (
)
from ...base_llm.image_variations.transformation import BaseImageVariationConfig
from ..common_utils import TopazException
from ..common_utils import TopazException, TopazModelInfo
class TopazImageVariationConfig(BaseImageVariationConfig):
class TopazImageVariationConfig(TopazModelInfo, BaseImageVariationConfig):
def get_supported_openai_params(
self, model: str
) -> List[OpenAIImageVariationOptionalParams]:
return ["response_format", "size"]
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
if api_key is None:
raise ValueError(
"API key is required for Topaz image variations. Set via `TOPAZ_API_KEY` or `api_key=..`"
)
return {
# "Content-Type": "multipart/form-data",
"Accept": "image/jpeg",
"X-API-Key": api_key,
}
def get_complete_url(
self,
api_base: Optional[str],

View file

@ -0,0 +1,75 @@
from typing import List, Optional, Union
import httpx
import litellm
from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
from litellm.utils import _add_path_to_api_base
class VLLMError(BaseLLMException):
pass
class VLLMModelInfo(BaseLLMModelInfo):
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
"""Google AI Studio sends api key in query params"""
return headers
@staticmethod
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
api_base = api_base or get_secret_str("VLLM_API_BASE")
if api_base is None:
raise ValueError(
"VLLM_API_BASE is not set. Please set the environment variable, to use VLLM's pass-through - `{LITELLM_API_BASE}/vllm/{endpoint}`."
)
return api_base
@staticmethod
def get_api_key(api_key: Optional[str] = None) -> Optional[str]:
return None
@staticmethod
def get_base_model(model: str) -> Optional[str]:
return model
def get_models(
self, api_key: Optional[str] = None, api_base: Optional[str] = None
) -> List[str]:
api_base = VLLMModelInfo.get_api_base(api_base)
api_key = VLLMModelInfo.get_api_key(api_key)
endpoint = "/v1/models"
if api_base is None:
raise ValueError(
"VLLM_API_BASE is not set. Please set the environment variable to query vLLM's `/models` endpoint."
)
url = _add_path_to_api_base(api_base, endpoint)
response = litellm.module_level_client.get(
url=url,
)
response.raise_for_status()
models = response.json()["data"]
return [model["id"] for model in models]
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return VLLMError(
status_code=status_code, message=error_message, headers=headers
)
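Functionally, the `get_models` above reduces to this sketch (server URL illustrative; plain `httpx` stands in for `litellm.module_level_client`):

import httpx

api_base = "http://localhost:8000"  # stand-in for VLLM_API_BASE
resp = httpx.get(f"{api_base}/v1/models")
resp.raise_for_status()
model_ids = [m["id"] for m in resp.json()["data"]]  # e.g. ["facebook/opt-125m"]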

View file

@ -5,9 +5,29 @@ import httpx
import litellm
from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
class XAIModelInfo(BaseLLMModelInfo):
def validate_environment(
self,
headers: dict,
model: str,
messages: List[AllMessageValues],
optional_params: dict,
litellm_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
if api_key is not None:
headers["Authorization"] = f"Bearer {api_key}"
# Ensure Content-Type is set to application/json
if "content-type" not in headers and "Content-Type" not in headers:
headers["Content-Type"] = "application/json"
return headers
@staticmethod
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
return api_base or get_secret_str("XAI_API_BASE") or "https://api.x.ai"
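A small sketch of the headers produced by the `validate_environment` above (key and model values are illustrative; assumes `XAIModelInfo` is concrete as defined in this file):

headers = XAIModelInfo().validate_environment(
    headers={},
    model="grok-2",
    messages=[],
    optional_params={},
    litellm_params={},
    api_key="xai-example-key",
)
# headers == {"Authorization": "Bearer xai-example-key", "Content-Type": "application/json"}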

View file

@ -954,7 +954,11 @@ def completion( # type: ignore # noqa: PLR0915
non_default_params = get_non_default_completion_params(kwargs=kwargs)
litellm_params = {} # used to prevent unbound var errors
## PROMPT MANAGEMENT HOOKS ##
if isinstance(litellm_logging_obj, LiteLLMLoggingObj) and prompt_id is not None:
if isinstance(litellm_logging_obj, LiteLLMLoggingObj) and (
litellm_logging_obj.should_run_prompt_management_hooks(
prompt_id=prompt_id, non_default_params=non_default_params
)
):
(
model,
messages,
@ -2654,9 +2658,9 @@ def completion( # type: ignore # noqa: PLR0915
"aws_region_name" not in optional_params
or optional_params["aws_region_name"] is None
):
optional_params[
"aws_region_name"
] = aws_bedrock_client.meta.region_name
optional_params["aws_region_name"] = (
aws_bedrock_client.meta.region_name
)
bedrock_route = BedrockModelInfo.get_bedrock_route(model)
if bedrock_route == "converse":
@ -4363,9 +4367,9 @@ def adapter_completion(
new_kwargs = translation_obj.translate_completion_input_params(kwargs=kwargs)
response: Union[ModelResponse, CustomStreamWrapper] = completion(**new_kwargs) # type: ignore
translated_response: Optional[
Union[BaseModel, AdapterCompletionStreamWrapper]
] = None
translated_response: Optional[Union[BaseModel, AdapterCompletionStreamWrapper]] = (
None
)
if isinstance(response, ModelResponse):
translated_response = translation_obj.translate_completion_output_params(
response=response
@ -5785,9 +5789,9 @@ def stream_chunk_builder( # noqa: PLR0915
]
if len(content_chunks) > 0:
response["choices"][0]["message"][
"content"
] = processor.get_combined_content(content_chunks)
response["choices"][0]["message"]["content"] = (
processor.get_combined_content(content_chunks)
)
reasoning_chunks = [
chunk
@ -5798,9 +5802,9 @@ def stream_chunk_builder( # noqa: PLR0915
]
if len(reasoning_chunks) > 0:
response["choices"][0]["message"][
"reasoning_content"
] = processor.get_combined_reasoning_content(reasoning_chunks)
response["choices"][0]["message"]["reasoning_content"] = (
processor.get_combined_reasoning_content(reasoning_chunks)
)
audio_chunks = [
chunk

View file

@ -2669,7 +2669,7 @@
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000008,
"input_cost_per_audio_token": 0.000004,
"output_cost_per_token": 0.00032,
"output_cost_per_token": 0.00000032,
"litellm_provider": "azure_ai",
"mode": "chat",
"supports_audio_input": true,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-87ec698bfca6820e.js" async=""></script><script src="/ui/_next/static/chunks/main-app-475d6efe4080647d.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/61b39adb1cb75213.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[98621,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-6810261f4d6c8bbf.js\",\"261\",\"static/chunks/261-d4b99bc9f53d4ef3.js\",\"899\",\"static/chunks/899-8d832fe7c09b2afe.js\",\"817\",\"static/chunks/817-c8c94d51d28af935.js\",\"250\",\"static/chunks/250-f21e8c1de1717077.js\",\"699\",\"static/chunks/699-b82671c2a3a100c7.js\",\"931\",\"static/chunks/app/page-0f39fb1d0bde3c55.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"eKo9wJb6ZFjAy85GaOeQ6\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/61b39adb1cb75213.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-87ec698bfca6820e.js" async=""></script><script src="/ui/_next/static/chunks/main-app-475d6efe4080647d.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/61b39adb1cb75213.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[37185,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-6810261f4d6c8bbf.js\",\"261\",\"static/chunks/261-d4b99bc9f53d4ef3.js\",\"899\",\"static/chunks/899-8d832fe7c09b2afe.js\",\"817\",\"static/chunks/817-c8c94d51d28af935.js\",\"250\",\"static/chunks/250-ef859ae76fcc7788.js\",\"699\",\"static/chunks/699-b82671c2a3a100c7.js\",\"931\",\"static/chunks/app/page-0f22a8aad3780430.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"cxKWUnb1loD53gKRUqiFm\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/61b39adb1cb75213.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[98621,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","899","static/chunks/899-8d832fe7c09b2afe.js","817","static/chunks/817-c8c94d51d28af935.js","250","static/chunks/250-f21e8c1de1717077.js","699","static/chunks/699-b82671c2a3a100c7.js","931","static/chunks/app/page-0f39fb1d0bde3c55.js"],"default",1]
3:I[37185,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","899","static/chunks/899-8d832fe7c09b2afe.js","817","static/chunks/817-c8c94d51d28af935.js","250","static/chunks/250-ef859ae76fcc7788.js","699","static/chunks/699-b82671c2a3a100c7.js","931","static/chunks/app/page-0f22a8aad3780430.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["eKo9wJb6ZFjAy85GaOeQ6",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["cxKWUnb1loD53gKRUqiFm",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[52829,["42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","250","static/chunks/250-f21e8c1de1717077.js","699","static/chunks/699-b82671c2a3a100c7.js","418","static/chunks/app/model_hub/page-068a441595bd0fc3.js"],"default",1]
3:I[52829,["42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","250","static/chunks/250-ef859ae76fcc7788.js","699","static/chunks/699-b82671c2a3a100c7.js","418","static/chunks/app/model_hub/page-068a441595bd0fc3.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["eKo9wJb6ZFjAy85GaOeQ6",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["cxKWUnb1loD53gKRUqiFm",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-6810261f4d6c8bbf.js","899","static/chunks/899-8d832fe7c09b2afe.js","250","static/chunks/250-f21e8c1de1717077.js","461","static/chunks/app/onboarding/page-957334f0c5d5df41.js"],"default",1]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-6810261f4d6c8bbf.js","899","static/chunks/899-8d832fe7c09b2afe.js","250","static/chunks/250-ef859ae76fcc7788.js","461","static/chunks/app/onboarding/page-957334f0c5d5df41.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["eKo9wJb6ZFjAy85GaOeQ6",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["cxKWUnb1loD53gKRUqiFm",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -33,6 +33,7 @@ model_list:
litellm_settings:
num_retries: 0
callbacks: ["prometheus"]
check_provider_endpoint: true
files_settings:
- custom_llm_provider: gemini

View file

@ -317,6 +317,8 @@ class LiteLLMRoutes(enum.Enum):
"/openai",
"/assemblyai",
"/eu.assemblyai",
"/vllm",
"/mistral",
]
anthropic_routes = [

View file

@ -1,11 +1,12 @@
# What is this?
## Common checks for /v1/models and `/model/info`
import copy
from typing import Dict, List, Optional, Set
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import SpecialModelNames, UserAPIKeyAuth
from litellm.router import Router
from litellm.types.router import LiteLLM_Params
from litellm.utils import get_valid_models
@ -23,15 +24,20 @@ def _check_wildcard_routing(model: str) -> bool:
return False
def get_provider_models(provider: str) -> Optional[List[str]]:
def get_provider_models(
provider: str, litellm_params: Optional[LiteLLM_Params] = None
) -> Optional[List[str]]:
"""
Returns the list of known models by provider
"""
if provider == "*":
return get_valid_models()
return get_valid_models(litellm_params=litellm_params)
if provider in litellm.models_by_provider:
provider_models = copy.deepcopy(litellm.models_by_provider[provider])
provider_models = get_valid_models(
custom_llm_provider=provider, litellm_params=litellm_params
)
# provider_models = copy.deepcopy(litellm.models_by_provider[provider])
for idx, _model in enumerate(provider_models):
if provider not in _model:
provider_models[idx] = f"{provider}/{_model}"
@ -118,6 +124,7 @@ def get_complete_model_list(
user_model: Optional[str],
infer_model_from_keys: Optional[bool],
return_wildcard_routes: Optional[bool] = False,
llm_router: Optional[Router] = None,
) -> List[str]:
"""Logic for returning complete model list for a given key + team pair"""
@ -143,19 +150,25 @@ def get_complete_model_list(
unique_models.update(valid_models)
all_wildcard_models = _get_wildcard_models(
unique_models=unique_models, return_wildcard_routes=return_wildcard_routes
unique_models=unique_models,
return_wildcard_routes=return_wildcard_routes,
llm_router=llm_router,
)
return list(unique_models) + all_wildcard_models
def get_known_models_from_wildcard(wildcard_model: str) -> List[str]:
def get_known_models_from_wildcard(
wildcard_model: str, litellm_params: Optional[LiteLLM_Params] = None
) -> List[str]:
try:
provider, model = wildcard_model.split("/", 1)
except ValueError: # safely fail
return []
# get all known provider models
wildcard_models = get_provider_models(provider=provider)
wildcard_models = get_provider_models(
provider=provider, litellm_params=litellm_params
)
if wildcard_models is None:
return []
if model == "*":
@ -172,7 +185,9 @@ def get_known_models_from_wildcard(wildcard_model: str) -> List[str]:
def _get_wildcard_models(
unique_models: Set[str], return_wildcard_routes: Optional[bool] = False
unique_models: Set[str],
return_wildcard_routes: Optional[bool] = False,
llm_router: Optional[Router] = None,
) -> List[str]:
models_to_remove = set()
all_wildcard_models = []
@ -183,12 +198,25 @@ def _get_wildcard_models(
): # will add the wildcard route to the list eg: anthropic/*.
all_wildcard_models.append(model)
# get all known provider models
wildcard_models = get_known_models_from_wildcard(wildcard_model=model)
## get litellm params from model
if llm_router is not None:
model_list = llm_router.get_model_list(model_name=model)
if model_list is not None:
for router_model in model_list:
wildcard_models = get_known_models_from_wildcard(
wildcard_model=model,
litellm_params=LiteLLM_Params(
**router_model["litellm_params"] # type: ignore
),
)
all_wildcard_models.extend(wildcard_models)
else:
# get all known provider models
wildcard_models = get_known_models_from_wildcard(wildcard_model=model)
if wildcard_models is not None:
models_to_remove.add(model)
all_wildcard_models.extend(wildcard_models)
if wildcard_models is not None:
models_to_remove.add(model)
all_wildcard_models.extend(wildcard_models)
for model in models_to_remove:
unique_models.remove(model)
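Conceptually, the wildcard expansion above reduces to this simplified sketch (provider and model names are illustrative; only the `provider/*` case is shown, and the real code resolves the model list via `get_provider_models` with the router's litellm_params):

def expand_wildcard(wildcard_model: str, provider_models: dict) -> list:
    # e.g. "anthropic/*" -> ["anthropic/claude-3-haiku", "anthropic/claude-3-opus"]
    try:
        provider, model = wildcard_model.split("/", 1)
    except ValueError:
        return []
    known = provider_models.get(provider, [])
    if model != "*":
        return []  # partial wildcards are handled separately in the real code
    return [m if provider in m else f"{provider}/{m}" for m in known]

print(expand_wildcard("anthropic/*", {"anthropic": ["claude-3-haiku", "claude-3-opus"]}))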

View file

@ -12,7 +12,7 @@ class X42PromptManagement(CustomPromptManagement):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:

View file

@ -6,6 +6,7 @@ Provider-specific Pass-Through Endpoints
Use litellm with Anthropic SDK, Vertex AI SDK, Cohere SDK, etc.
"""
import os
from typing import Optional
import httpx
@ -43,6 +44,84 @@ def create_request_copy(request: Request):
}
async def llm_passthrough_factory_proxy_route(
custom_llm_provider: str,
endpoint: str,
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Factory function for creating pass-through endpoints for LLM providers.
"""
from litellm.types.utils import LlmProviders
from litellm.utils import ProviderConfigManager
provider_config = ProviderConfigManager.get_provider_model_info(
provider=LlmProviders(custom_llm_provider),
model=None,
)
if provider_config is None:
raise HTTPException(
status_code=404, detail=f"Provider {custom_llm_provider} not found"
)
base_target_url = provider_config.get_api_base()
if base_target_url is None:
raise HTTPException(
status_code=404, detail=f"Provider {custom_llm_provider} api base not found"
)
encoded_endpoint = httpx.URL(endpoint).path
# Ensure endpoint starts with '/' for proper URL construction
if not encoded_endpoint.startswith("/"):
encoded_endpoint = "/" + encoded_endpoint
# Construct the full target URL using httpx
base_url = httpx.URL(base_target_url)
updated_url = base_url.copy_with(path=encoded_endpoint)
# Add or update query parameters
provider_api_key = passthrough_endpoint_router.get_credentials(
custom_llm_provider=custom_llm_provider,
region_name=None,
)
auth_headers = provider_config.validate_environment(
headers={},
model="",
messages=[],
optional_params={},
litellm_params={},
api_key=provider_api_key,
api_base=base_target_url,
)
## check for streaming
is_streaming_request = False
# streaming requests set 'stream' = True in the body
if request.method == "POST":
_request_body = await request.json()
if _request_body.get("stream"):
is_streaming_request = True
## CREATE PASS-THROUGH
endpoint_func = create_pass_through_route(
endpoint=endpoint,
target=str(updated_url),
custom_headers=auth_headers,
) # dynamically construct pass-through endpoint based on incoming path
received_value = await endpoint_func(
request,
fastapi_response,
user_api_key_dict,
stream=is_streaming_request, # type: ignore
)
return received_value
@router.api_route(
"/gemini/{endpoint:path}",
methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
@ -162,6 +241,84 @@ async def cohere_proxy_route(
return received_value
@router.api_route(
"/vllm/{endpoint:path}",
methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
tags=["VLLM Pass-through", "pass-through"],
)
async def vllm_proxy_route(
endpoint: str,
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
[Docs](https://docs.litellm.ai/docs/pass_through/vllm)
"""
return await llm_passthrough_factory_proxy_route(
endpoint=endpoint,
request=request,
fastapi_response=fastapi_response,
user_api_key_dict=user_api_key_dict,
custom_llm_provider="vllm",
)
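With the route above registered, clients can reach the vLLM backend through the proxy; a sketch (proxy URL and virtual key are illustrative):

import httpx

resp = httpx.get(
    "http://localhost:4000/vllm/v1/models",       # LiteLLM proxy
    headers={"Authorization": "Bearer sk-1234"},  # LiteLLM virtual key
)
print(resp.json())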
@router.api_route(
"/mistral/{endpoint:path}",
methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
tags=["Mistral Pass-through", "pass-through"],
)
async def mistral_proxy_route(
endpoint: str,
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
[Docs](https://docs.litellm.ai/docs/pass_through/mistral)
"""
base_target_url = os.getenv("MISTRAL_API_BASE") or "https://api.mistral.ai"
encoded_endpoint = httpx.URL(endpoint).path
# Ensure endpoint starts with '/' for proper URL construction
if not encoded_endpoint.startswith("/"):
encoded_endpoint = "/" + encoded_endpoint
# Construct the full target URL using httpx
base_url = httpx.URL(base_target_url)
updated_url = base_url.copy_with(path=encoded_endpoint)
# Add or update query parameters
mistral_api_key = passthrough_endpoint_router.get_credentials(
custom_llm_provider="mistral",
region_name=None,
)
## check for streaming
is_streaming_request = False
# mistral is streaming when 'stream' = True is in the body
if request.method == "POST":
_request_body = await request.json()
if _request_body.get("stream"):
is_streaming_request = True
## CREATE PASS-THROUGH
endpoint_func = create_pass_through_route(
endpoint=endpoint,
target=str(updated_url),
custom_headers={"Authorization": "Bearer {}".format(mistral_api_key)},
) # dynamically construct pass-through endpoint based on incoming path
received_value = await endpoint_func(
request,
fastapi_response,
user_api_key_dict,
stream=is_streaming_request, # type: ignore
)
return received_value
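Similarly, a sketch of calling Mistral through the new pass-through (proxy URL, key, and model are illustrative):

import httpx

resp = httpx.post(
    "http://localhost:4000/mistral/v1/chat/completions",  # LiteLLM proxy
    headers={"Authorization": "Bearer sk-1234"},           # LiteLLM virtual key
    json={
        "model": "mistral-small-latest",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
print(resp.json()["choices"][0]["message"]["content"])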
@router.api_route(
"/anthropic/{endpoint:path}",
methods=["GET", "POST", "PUT", "DELETE", "PATCH"],

View file

@ -0,0 +1,219 @@
import json
from datetime import datetime
from typing import TYPE_CHECKING, Any, List, Optional, Union
import httpx
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.litellm_logging import (
get_standard_logging_object_payload,
)
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.proxy._types import PassThroughEndpointLoggingTypedDict
from litellm.proxy.auth.auth_utils import get_end_user_id_from_request_body
from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload
from litellm.types.utils import LlmProviders, ModelResponse, TextCompletionResponse
if TYPE_CHECKING:
from ..success_handler import PassThroughEndpointLogging
from ..types import EndpointType
else:
PassThroughEndpointLogging = Any
EndpointType = Any
from abc import ABC, abstractmethod
class BasePassthroughLoggingHandler(ABC):
@property
@abstractmethod
def llm_provider_name(self) -> LlmProviders:
pass
@abstractmethod
def get_provider_config(self, model: str) -> BaseConfig:
pass
def passthrough_chat_handler(
self,
httpx_response: httpx.Response,
response_body: dict,
logging_obj: LiteLLMLoggingObj,
url_route: str,
result: str,
start_time: datetime,
end_time: datetime,
cache_hit: bool,
request_body: dict,
**kwargs,
) -> PassThroughEndpointLoggingTypedDict:
"""
Transforms LLM response to OpenAI response, generates a standard logging object so downstream logging can be handled
"""
model = request_body.get("model", response_body.get("model", ""))
provider_config = self.get_provider_config(model=model)
litellm_model_response: ModelResponse = provider_config.transform_response(
raw_response=httpx_response,
model_response=litellm.ModelResponse(),
model=model,
messages=[],
logging_obj=logging_obj,
optional_params={},
api_key="",
request_data={},
encoding=litellm.encoding,
json_mode=False,
litellm_params={},
)
kwargs = self._create_response_logging_payload(
litellm_model_response=litellm_model_response,
model=model,
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
logging_obj=logging_obj,
)
return {
"result": litellm_model_response,
"kwargs": kwargs,
}
def _get_user_from_metadata(
self,
passthrough_logging_payload: PassthroughStandardLoggingPayload,
) -> Optional[str]:
request_body = passthrough_logging_payload.get("request_body")
if request_body:
return get_end_user_id_from_request_body(request_body)
return None
def _create_response_logging_payload(
self,
litellm_model_response: Union[ModelResponse, TextCompletionResponse],
model: str,
kwargs: dict,
start_time: datetime,
end_time: datetime,
logging_obj: LiteLLMLoggingObj,
) -> dict:
"""
Create the standard logging object for Generic LLM passthrough
handles streaming and non-streaming responses
"""
try:
response_cost = litellm.completion_cost(
completion_response=litellm_model_response,
model=model,
)
kwargs["response_cost"] = response_cost
kwargs["model"] = model
passthrough_logging_payload: Optional[PassthroughStandardLoggingPayload] = ( # type: ignore
kwargs.get("passthrough_logging_payload")
)
if passthrough_logging_payload:
user = self._get_user_from_metadata(
passthrough_logging_payload=passthrough_logging_payload,
)
if user:
kwargs.setdefault("litellm_params", {})
kwargs["litellm_params"].update(
{"proxy_server_request": {"body": {"user": user}}}
)
# Make standard logging object for the passthrough provider
standard_logging_object = get_standard_logging_object_payload(
kwargs=kwargs,
init_response_obj=litellm_model_response,
start_time=start_time,
end_time=end_time,
logging_obj=logging_obj,
status="success",
)
# pretty print standard logging object
verbose_proxy_logger.debug(
"standard_logging_object= %s",
json.dumps(standard_logging_object, indent=4),
)
kwargs["standard_logging_object"] = standard_logging_object
# set litellm_call_id to logging response object
litellm_model_response.id = logging_obj.litellm_call_id
litellm_model_response.model = model
logging_obj.model_call_details["model"] = model
return kwargs
except Exception as e:
verbose_proxy_logger.exception(
"Error creating LLM passthrough response logging payload: %s", e
)
return kwargs
@abstractmethod
def _build_complete_streaming_response(
self,
all_chunks: List[str],
litellm_logging_obj: LiteLLMLoggingObj,
model: str,
) -> Optional[Union[ModelResponse, TextCompletionResponse]]:
"""
Builds complete response from raw chunks
- Converts str chunks to generic chunks
- Converts generic chunks to litellm chunks (OpenAI format)
- Builds complete response from litellm chunks
"""
pass
def _handle_logging_llm_collected_chunks(
self,
litellm_logging_obj: LiteLLMLoggingObj,
passthrough_success_handler_obj: PassThroughEndpointLogging,
url_route: str,
request_body: dict,
endpoint_type: EndpointType,
start_time: datetime,
all_chunks: List[str],
end_time: datetime,
) -> PassThroughEndpointLoggingTypedDict:
"""
Takes raw chunks from an LLM passthrough endpoint and logs them in litellm callbacks
- Builds complete response from chunks
- Creates standard logging object
- Logs in litellm callbacks
"""
model = request_body.get("model", "")
complete_streaming_response = self._build_complete_streaming_response(
all_chunks=all_chunks,
litellm_logging_obj=litellm_logging_obj,
model=model,
)
if complete_streaming_response is None:
verbose_proxy_logger.error(
"Unable to build complete streaming response for Anthropic passthrough endpoint, not logging..."
)
return {
"result": None,
"kwargs": {},
}
kwargs = self._create_response_logging_payload(
litellm_model_response=complete_streaming_response,
model=model,
kwargs={},
start_time=start_time,
end_time=end_time,
logging_obj=litellm_logging_obj,
)
return {
"result": complete_streaming_response,
"kwargs": kwargs,
}

View file

@ -0,0 +1,56 @@
from typing import List, Optional, Union
from litellm import stream_chunk_builder
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.llms.cohere.chat.v2_transformation import CohereV2ChatConfig
from litellm.llms.cohere.common_utils import (
ModelResponseIterator as CohereModelResponseIterator,
)
from litellm.types.utils import LlmProviders, ModelResponse, TextCompletionResponse
from .base_passthrough_logging_handler import BasePassthroughLoggingHandler
class CoherePassthroughLoggingHandler(BasePassthroughLoggingHandler):
@property
def llm_provider_name(self) -> LlmProviders:
return LlmProviders.COHERE
def get_provider_config(self, model: str) -> BaseConfig:
return CohereV2ChatConfig()
def _build_complete_streaming_response(
self,
all_chunks: List[str],
litellm_logging_obj: LiteLLMLoggingObj,
model: str,
) -> Optional[Union[ModelResponse, TextCompletionResponse]]:
cohere_model_response_iterator = CohereModelResponseIterator(
streaming_response=None,
sync_stream=False,
)
litellm_custom_stream_wrapper = CustomStreamWrapper(
completion_stream=cohere_model_response_iterator,
model=model,
logging_obj=litellm_logging_obj,
custom_llm_provider="cohere",
)
all_openai_chunks = []
for _chunk_str in all_chunks:
try:
generic_chunk = (
cohere_model_response_iterator.convert_str_chunk_to_generic_chunk(
chunk=_chunk_str
)
)
litellm_chunk = litellm_custom_stream_wrapper.chunk_creator(
chunk=generic_chunk
)
if litellm_chunk is not None:
all_openai_chunks.append(litellm_chunk)
except (StopIteration, StopAsyncIteration):
break
complete_streaming_response = stream_chunk_builder(chunks=all_openai_chunks)
return complete_streaming_response
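A minimal usage sketch (not part of this commit) of how the proxy's streaming success path might drive this handler once a Cohere /v2/chat passthrough stream finishes. Only the method signature comes from the base class above; the chunk list, logging objects, and endpoint-type value are assumptions for illustration.
from datetime import datetime

cohere_handler = CoherePassthroughLoggingHandler()

# raw_sse_chunks, litellm_logging_obj, pass_through_endpoint_logging and endpoint_type
# are assumed to be in scope (collected/constructed by the pass-through endpoint).
logging_result = cohere_handler._handle_logging_llm_collected_chunks(
    litellm_logging_obj=litellm_logging_obj,
    passthrough_success_handler_obj=pass_through_endpoint_logging,
    url_route="/v2/chat",
    request_body={"model": "command-r-plus", "messages": []},  # illustrative body
    endpoint_type=endpoint_type,  # EndpointType member for Cohere; exact name assumed
    start_time=datetime.now(),
    all_chunks=raw_sse_chunks,
    end_time=datetime.now(),
)
# logging_result["result"] is the rebuilt ModelResponse (or None on failure);
# logging_result["kwargs"] carries the standard logging payload used by litellm callbacks.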

View file

@@ -683,6 +683,7 @@ async def pass_through_request( # noqa: PLR0915
end_time=end_time,
logging_obj=logging_obj,
cache_hit=False,
request_body=_parsed_body,
**kwargs,
)
)

View file

@@ -16,10 +16,15 @@ from .llm_provider_handlers.anthropic_passthrough_logging_handler import (
from .llm_provider_handlers.assembly_passthrough_logging_handler import (
AssemblyAIPassthroughLoggingHandler,
)
from .llm_provider_handlers.cohere_passthrough_logging_handler import (
CoherePassthroughLoggingHandler,
)
from .llm_provider_handlers.vertex_passthrough_logging_handler import (
VertexPassthroughLoggingHandler,
)
cohere_passthrough_logging_handler = CoherePassthroughLoggingHandler()
class PassThroughEndpointLogging:
def __init__(self):
@@ -32,6 +37,8 @@ class PassThroughEndpointLogging:
# Anthropic
self.TRACKED_ANTHROPIC_ROUTES = ["/messages"]
# Cohere
self.TRACKED_COHERE_ROUTES = ["/v2/chat"]
self.assemblyai_passthrough_logging_handler = (
AssemblyAIPassthroughLoggingHandler()
)
@@ -84,6 +91,7 @@ class PassThroughEndpointLogging:
start_time: datetime,
end_time: datetime,
cache_hit: bool,
request_body: dict,
**kwargs,
):
standard_logging_response_object: Optional[
@@ -125,6 +133,25 @@ class PassThroughEndpointLogging:
anthropic_passthrough_logging_handler_result["result"]
)
kwargs = anthropic_passthrough_logging_handler_result["kwargs"]
elif self.is_cohere_route(url_route):
cohere_passthrough_logging_handler_result = (
cohere_passthrough_logging_handler.passthrough_chat_handler(
httpx_response=httpx_response,
response_body=response_body or {},
logging_obj=logging_obj,
url_route=url_route,
result=result,
start_time=start_time,
end_time=end_time,
cache_hit=cache_hit,
request_body=request_body,
**kwargs,
)
)
standard_logging_response_object = (
cohere_passthrough_logging_handler_result["result"]
)
kwargs = cohere_passthrough_logging_handler_result["kwargs"]
elif self.is_assemblyai_route(url_route):
if (
AssemblyAIPassthroughLoggingHandler._should_log_request(
@@ -173,6 +200,11 @@ class PassThroughEndpointLogging:
return True
return False
def is_cohere_route(self, url_route: str):
for route in self.TRACKED_COHERE_ROUTES:
if route in url_route:
return True
return False
def is_assemblyai_route(self, url_route: str):
parsed_url = urlparse(url_route)
if parsed_url.hostname == "api.assemblyai.com":

View file

@@ -803,9 +803,9 @@ model_max_budget_limiter = _PROXY_VirtualKeyModelMaxBudgetLimiter(
dual_cache=user_api_key_cache
)
litellm.logging_callback_manager.add_litellm_callback(model_max_budget_limiter)
redis_usage_cache: Optional[RedisCache] = (
None # redis cache used for tracking spend, tpm/rpm limits
)
redis_usage_cache: Optional[
RedisCache
] = None # redis cache used for tracking spend, tpm/rpm limits
user_custom_auth = None
user_custom_key_generate = None
user_custom_sso = None
@@ -1131,9 +1131,9 @@ async def update_cache( # noqa: PLR0915
_id = "team_id:{}".format(team_id)
try:
# Fetch the existing cost for the given user
existing_spend_obj: Optional[LiteLLM_TeamTable] = (
await user_api_key_cache.async_get_cache(key=_id)
)
existing_spend_obj: Optional[
LiteLLM_TeamTable
] = await user_api_key_cache.async_get_cache(key=_id)
if existing_spend_obj is None:
# do nothing if team not in api key cache
return
@@ -2812,9 +2812,9 @@ async def initialize( # noqa: PLR0915
user_api_base = api_base
dynamic_config[user_model]["api_base"] = api_base
if api_version:
os.environ["AZURE_API_VERSION"] = (
api_version # set this for azure - litellm can read this from the env
)
os.environ[
"AZURE_API_VERSION"
] = api_version # set this for azure - litellm can read this from the env
if max_tokens: # model-specific param
dynamic_config[user_model]["max_tokens"] = max_tokens
if temperature: # model-specific param
@@ -3316,6 +3316,7 @@ async def model_list(
user_model=user_model,
infer_model_from_keys=general_settings.get("infer_model_from_keys", False),
return_wildcard_routes=return_wildcard_routes,
llm_router=llm_router,
)
return dict(
@@ -7758,9 +7759,9 @@ async def get_config_list(
hasattr(sub_field_info, "description")
and sub_field_info.description is not None
):
nested_fields[idx].field_description = (
sub_field_info.description
)
nested_fields[
idx
].field_description = sub_field_info.description
idx += 1
_stored_in_db = None

View file

@@ -0,0 +1,17 @@
from typing import Literal, Optional, TypedDict, Union
from litellm.types.llms.openai import ChatCompletionCachedContent
class CacheControlMessageInjectionPoint(TypedDict):
"""Type for message-level injection points."""
location: Literal["message"]
role: Optional[
Literal["user", "system", "assistant"]
] # Optional: target by role (user, system, assistant)
index: Optional[Union[int, str]] # Optional: target by specific index
control: Optional[ChatCompletionCachedContent]
CacheControlInjectionPoint = CacheControlMessageInjectionPoint
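As a hedged usage sketch (mirroring the tests added later in this commit, not a new API), a caller can register the Anthropic cache-control hook and pass message-level injection points; the model name is illustrative.
import litellm
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook

litellm.callbacks = [AnthropicCacheControlHook()]

response = litellm.completion(
    model="bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
    messages=[
        {"role": "system", "content": "You are an AI assistant analyzing legal documents."},
        {"role": "user", "content": "What are the key terms and conditions in this agreement?"},
    ],
    # inject a cache-control checkpoint into every system message;
    # use {"location": "message", "index": 1} to target one message by position instead
    cache_control_injection_points=[
        {"location": "message", "role": "system"},
    ],
)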

View file

@@ -44,3 +44,59 @@ class ChatHistoryChatBot(TypedDict, total=False):
ChatHistory = List[
Union[ChatHistorySystem, ChatHistoryChatBot, ChatHistoryUser, ChatHistoryToolResult]
]
class CohereV2ChatResponseMessageToolCallFunction(TypedDict, total=False):
name: str
parameters: dict
class CohereV2ChatResponseMessageToolCall(TypedDict):
id: str
type: Literal["function"]
function: CohereV2ChatResponseMessageToolCallFunction
class CohereV2ChatResponseMessageContent(TypedDict):
id: str
type: Literal["tool"]
tool: str
class CohereV2ChatResponseMessage(TypedDict, total=False):
role: Required[Literal["assistant"]]
tool_calls: List[CohereV2ChatResponseMessageToolCall]
tool_plan: str
content: List[CohereV2ChatResponseMessageContent]
citations: List[dict]
class CohereV2ChatResponseUsageBilledUnits(TypedDict, total=False):
input_tokens: int
output_tokens: int
search_units: int
classifications: int
class CohereV2ChatResponseUsageTokens(TypedDict, total=False):
input_tokens: int
output_tokens: int
class CohereV2ChatResponseUsage(TypedDict, total=False):
billed_units: CohereV2ChatResponseUsageBilledUnits
tokens: CohereV2ChatResponseUsageTokens
class CohereV2ChatResponseLogProbs(TypedDict, total=False):
token_ids: Required[List[int]]
text: str
logprobs: List[float]
class CohereV2ChatResponse(TypedDict):
id: str
finish_reason: str
message: CohereV2ChatResponseMessage
usage: CohereV2ChatResponseUsage
logprobs: CohereV2ChatResponseLogProbs
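For orientation, a hedged example payload that satisfies these TypedDicts; the values are illustrative and not taken from Cohere's API documentation.
example_response: CohereV2ChatResponse = {
    "id": "resp_123",
    "finish_reason": "COMPLETE",
    "message": {
        "role": "assistant",
        "tool_plan": "I will call the weather tool.",
        "tool_calls": [
            {
                "id": "call_1",
                "type": "function",
                "function": {"name": "get_weather", "parameters": {"city": "Paris"}},
            }
        ],
    },
    "usage": {
        "billed_units": {"input_tokens": 12, "output_tokens": 34},
        "tokens": {"input_tokens": 15, "output_tokens": 40},
    },
    "logprobs": {"token_ids": [101, 102, 103]},
}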

View file

@@ -516,9 +516,9 @@ def function_setup( # noqa: PLR0915
function_id: Optional[str] = kwargs["id"] if "id" in kwargs else None
## DYNAMIC CALLBACKS ##
dynamic_callbacks: Optional[List[Union[str, Callable, CustomLogger]]] = (
kwargs.pop("callbacks", None)
)
dynamic_callbacks: Optional[
List[Union[str, Callable, CustomLogger]]
] = kwargs.pop("callbacks", None)
all_callbacks = get_dynamic_callbacks(dynamic_callbacks=dynamic_callbacks)
if len(all_callbacks) > 0:
@@ -1202,9 +1202,9 @@ def client(original_function): # noqa: PLR0915
exception=e,
retry_policy=kwargs.get("retry_policy"),
)
kwargs["retry_policy"] = (
reset_retry_policy()
) # prevent infinite loops
kwargs[
"retry_policy"
] = reset_retry_policy() # prevent infinite loops
litellm.num_retries = (
None # set retries to None to prevent infinite loops
)
@@ -3013,16 +3013,16 @@ def get_optional_params( # noqa: PLR0915
True # so that main.py adds the function call to the prompt
)
if "tools" in non_default_params:
optional_params["functions_unsupported_model"] = (
non_default_params.pop("tools")
)
optional_params[
"functions_unsupported_model"
] = non_default_params.pop("tools")
non_default_params.pop(
"tool_choice", None
) # causes ollama requests to hang
elif "functions" in non_default_params:
optional_params["functions_unsupported_model"] = (
non_default_params.pop("functions")
)
optional_params[
"functions_unsupported_model"
] = non_default_params.pop("functions")
elif (
litellm.add_function_to_prompt
): # if user opts to add it to prompt instead
@@ -3045,10 +3045,10 @@ def get_optional_params( # noqa: PLR0915
if "response_format" in non_default_params:
if provider_config is not None:
non_default_params["response_format"] = (
provider_config.get_json_schema_from_pydantic_object(
response_format=non_default_params["response_format"]
)
non_default_params[
"response_format"
] = provider_config.get_json_schema_from_pydantic_object(
response_format=non_default_params["response_format"]
)
else:
non_default_params["response_format"] = type_to_response_format_param(
@@ -4064,9 +4064,9 @@ def _count_characters(text: str) -> int:
def get_response_string(response_obj: Union[ModelResponse, ModelResponseStream]) -> str:
_choices: Union[List[Union[Choices, StreamingChoices]], List[StreamingChoices]] = (
response_obj.choices
)
_choices: Union[
List[Union[Choices, StreamingChoices]], List[StreamingChoices]
] = response_obj.choices
response_str = ""
for choice in _choices:
@@ -4458,14 +4458,14 @@ def _get_model_info_helper( # noqa: PLR0915
if combined_model_name in litellm.model_cost:
key = combined_model_name
_model_info = _get_model_info_from_model_cost(key=key)
_model_info = _get_model_info_from_model_cost(key=cast(str, key))
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and model in litellm.model_cost:
key = model
_model_info = _get_model_info_from_model_cost(key=key)
_model_info = _get_model_info_from_model_cost(key=cast(str, key))
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
@@ -4475,21 +4475,21 @@ def _get_model_info_helper( # noqa: PLR0915
and combined_stripped_model_name in litellm.model_cost
):
key = combined_stripped_model_name
_model_info = _get_model_info_from_model_cost(key=key)
_model_info = _get_model_info_from_model_cost(key=cast(str, key))
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and stripped_model_name in litellm.model_cost:
key = stripped_model_name
_model_info = _get_model_info_from_model_cost(key=key)
_model_info = _get_model_info_from_model_cost(key=cast(str, key))
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and split_model in litellm.model_cost:
key = split_model
_model_info = _get_model_info_from_model_cost(key=key)
_model_info = _get_model_info_from_model_cost(key=cast(str, key))
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
@@ -5807,8 +5807,133 @@ def trim_messages(
return messages
from litellm.caching.in_memory_cache import InMemoryCache
class AvailableModelsCache(InMemoryCache):
def __init__(self, ttl_seconds: int = 300, max_size: int = 1000):
super().__init__(ttl_seconds, max_size)
self._env_hash: Optional[str] = None
def _get_env_hash(self) -> str:
"""Create a hash of relevant environment variables"""
env_vars = {
k: v
for k, v in os.environ.items()
if k.startswith(("OPENAI", "ANTHROPIC", "AZURE", "AWS"))
}
return str(hash(frozenset(env_vars.items())))
def _check_env_changed(self) -> bool:
"""Check if environment variables have changed"""
current_hash = self._get_env_hash()
if self._env_hash is None:
self._env_hash = current_hash
return True
return current_hash != self._env_hash
def _get_cache_key(
self,
custom_llm_provider: Optional[str],
litellm_params: Optional[LiteLLM_Params],
) -> str:
valid_str = ""
if litellm_params is not None:
valid_str = litellm_params.model_dump_json()
if custom_llm_provider is not None:
valid_str = f"{custom_llm_provider}:{valid_str}"
return hashlib.sha256(valid_str.encode()).hexdigest()
def get_cached_model_info(
self,
custom_llm_provider: Optional[str] = None,
litellm_params: Optional[LiteLLM_Params] = None,
) -> Optional[List[str]]:
"""Get cached model info"""
# Check if environment has changed
if litellm_params is None and self._check_env_changed():
self.cache_dict.clear()
return None
cache_key = self._get_cache_key(custom_llm_provider, litellm_params)
result = cast(Optional[List[str]], self.get_cache(cache_key))
if result is not None:
return copy.deepcopy(result)
return result
def set_cached_model_info(
self,
custom_llm_provider: str,
litellm_params: Optional[LiteLLM_Params],
available_models: List[str],
):
"""Set cached model info"""
cache_key = self._get_cache_key(custom_llm_provider, litellm_params)
self.set_cache(cache_key, copy.deepcopy(available_models))
# Global cache instance
_model_cache = AvailableModelsCache()
def _infer_valid_provider_from_env_vars(
custom_llm_provider: Optional[str] = None,
) -> List[str]:
valid_providers: List[str] = []
environ_keys = os.environ.keys()
for provider in litellm.provider_list:
if custom_llm_provider and provider != custom_llm_provider:
continue
# edge case litellm has together_ai as a provider, it should be togetherai
env_provider_1 = provider.replace("_", "")
env_provider_2 = provider
# litellm standardizes expected provider keys to
# PROVIDER_API_KEY. Example: OPENAI_API_KEY, COHERE_API_KEY
expected_provider_key_1 = f"{env_provider_1.upper()}_API_KEY"
expected_provider_key_2 = f"{env_provider_2.upper()}_API_KEY"
if (
expected_provider_key_1 in environ_keys
or expected_provider_key_2 in environ_keys
):
# key is set
valid_providers.append(provider)
return valid_providers
def _get_valid_models_from_provider_api(
provider_config: BaseLLMModelInfo,
custom_llm_provider: str,
litellm_params: Optional[LiteLLM_Params] = None,
) -> List[str]:
try:
cached_result = _model_cache.get_cached_model_info(
custom_llm_provider, litellm_params
)
if cached_result is not None:
return cached_result
models = provider_config.get_models(
api_key=litellm_params.api_key if litellm_params is not None else None,
api_base=litellm_params.api_base if litellm_params is not None else None,
)
_model_cache.set_cached_model_info(custom_llm_provider, litellm_params, models)
return models
except Exception as e:
verbose_logger.debug(f"Error getting valid models: {e}")
return []
def get_valid_models(
check_provider_endpoint: bool = False, custom_llm_provider: Optional[str] = None
check_provider_endpoint: Optional[bool] = None,
custom_llm_provider: Optional[str] = None,
litellm_params: Optional[LiteLLM_Params] = None,
) -> List[str]:
"""
Returns a list of valid LLMs based on the set environment variables
@@ -5819,31 +5944,21 @@ def get_valid_models(
Returns:
A list of valid LLMs
"""
try:
check_provider_endpoint = (
check_provider_endpoint or litellm.check_provider_endpoint
)
# get keys set in .env
environ_keys = os.environ.keys()
valid_providers = []
valid_providers: List[str] = []
valid_models: List[str] = []
# for all valid providers, make a list of supported llms
valid_models = []
for provider in litellm.provider_list:
if custom_llm_provider and provider != custom_llm_provider:
continue
# edge case litellm has together_ai as a provider, it should be togetherai
env_provider_1 = provider.replace("_", "")
env_provider_2 = provider
# litellm standardizes expected provider keys to
# PROVIDER_API_KEY. Example: OPENAI_API_KEY, COHERE_API_KEY
expected_provider_key_1 = f"{env_provider_1.upper()}_API_KEY"
expected_provider_key_2 = f"{env_provider_2.upper()}_API_KEY"
if (
expected_provider_key_1 in environ_keys
or expected_provider_key_2 in environ_keys
):
# key is set
valid_providers.append(provider)
if custom_llm_provider:
valid_providers = [custom_llm_provider]
else:
valid_providers = _infer_valid_provider_from_env_vars(custom_llm_provider)
for provider in valid_providers:
provider_config = ProviderConfigManager.get_provider_model_info(
@@ -5856,15 +5971,24 @@ def get_valid_models(
if provider == "azure":
valid_models.append("Azure-LLM")
elif provider_config is not None and check_provider_endpoint:
try:
models = provider_config.get_models()
valid_models.extend(models)
except Exception as e:
verbose_logger.debug(f"Error getting valid models: {e}")
elif (
provider_config is not None
and check_provider_endpoint
and provider is not None
):
valid_models.extend(
_get_valid_models_from_provider_api(
provider_config,
provider,
litellm_params,
)
)
else:
models_for_provider = litellm.models_by_provider.get(provider, [])
models_for_provider = copy.deepcopy(
litellm.models_by_provider.get(provider, [])
)
valid_models.extend(models_for_provider)
return valid_models
except Exception as e:
verbose_logger.debug(f"Error getting valid models: {e}")
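A short usage sketch of the extended signature, based on the tests added later in this commit. The placeholder API key is illustrative; provider endpoints are only queried when check_provider_endpoint is true, and results are served from AvailableModelsCache, which is cleared when the relevant provider environment variables change.
from litellm.utils import get_valid_models
from litellm.types.router import CredentialLiteLLMParams

# Static list from litellm.models_by_provider (no network call).
openai_models = get_valid_models(custom_llm_provider="openai")

# Query the provider's model-list endpoint with dynamically supplied credentials.
creds = CredentialLiteLLMParams(api_key="sk-placeholder")  # illustrative key
anthropic_models = get_valid_models(
    custom_llm_provider="anthropic",
    litellm_params=creds,
    check_provider_endpoint=True,
)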
@@ -6510,7 +6634,12 @@ class ProviderConfigManager:
return litellm.AnthropicModelInfo()
elif LlmProviders.XAI == provider:
return litellm.XAIModelInfo()
elif LlmProviders.VLLM == provider:
from litellm.llms.vllm.common_utils import (
VLLMModelInfo, # experimental approach, to reduce bloat on __init__.py
)
return VLLMModelInfo()
return None
@staticmethod

View file

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.66.0"
version = "1.66.1"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -118,7 +118,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.66.0"
version = "1.66.1"
version_files = [
"pyproject.toml:^version"
]

View file

@@ -0,0 +1,169 @@
import datetime
import json
import os
import sys
import unittest
from typing import List, Optional, Tuple
from unittest.mock import ANY, MagicMock, Mock, patch
import httpx
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system-path
import litellm
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import StandardCallbackDynamicParams
@pytest.mark.asyncio
async def test_anthropic_cache_control_hook_system_message():
# Use patch.dict to mock environment variables instead of setting them directly
with patch.dict(
os.environ,
{
"AWS_ACCESS_KEY_ID": "fake_access_key_id",
"AWS_SECRET_ACCESS_KEY": "fake_secret_access_key",
"AWS_REGION_NAME": "us-west-2",
},
):
anthropic_cache_control_hook = AnthropicCacheControlHook()
litellm.callbacks = [anthropic_cache_control_hook]
# Mock response data
mock_response = MagicMock()
mock_response.json.return_value = {
"output": {
"message": {
"role": "assistant",
"content": "Here is my analysis of the key terms and conditions...",
}
},
"stopReason": "stop_sequence",
"usage": {
"inputTokens": 100,
"outputTokens": 200,
"totalTokens": 300,
"cacheReadInputTokens": 100,
"cacheWriteInputTokens": 200,
},
}
mock_response.status_code = 200
# Mock AsyncHTTPHandler.post method
client = AsyncHTTPHandler()
with patch.object(client, "post", return_value=mock_response) as mock_post:
response = await litellm.acompletion(
model="bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
cache_control_injection_points=[
{
"location": "message",
"role": "system",
},
],
client=client,
)
mock_post.assert_called_once()
request_body = json.loads(mock_post.call_args.kwargs["data"])
print("request_body: ", json.dumps(request_body, indent=4))
# Verify the request body
assert request_body["system"][1]["cachePoint"] == {"type": "default"}
@pytest.mark.asyncio
async def test_anthropic_cache_control_hook_user_message():
# Use patch.dict to mock environment variables instead of setting them directly
with patch.dict(
os.environ,
{
"AWS_ACCESS_KEY_ID": "fake_access_key_id",
"AWS_SECRET_ACCESS_KEY": "fake_secret_access_key",
"AWS_REGION_NAME": "us-west-2",
},
):
anthropic_cache_control_hook = AnthropicCacheControlHook()
litellm.callbacks = [anthropic_cache_control_hook]
# Mock response data
mock_response = MagicMock()
mock_response.json.return_value = {
"output": {
"message": {
"role": "assistant",
"content": "Here is my analysis of the key terms and conditions...",
}
},
"stopReason": "stop_sequence",
"usage": {
"inputTokens": 100,
"outputTokens": 200,
"totalTokens": 300,
"cacheReadInputTokens": 100,
"cacheWriteInputTokens": 200,
},
}
mock_response.status_code = 200
# Mock AsyncHTTPHandler.post method
client = AsyncHTTPHandler()
with patch.object(client, "post", return_value=mock_response) as mock_post:
response = await litellm.acompletion(
model="bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
messages=[
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement? <very_long_text>",
},
],
cache_control_injection_points=[
{
"location": "message",
"role": "user",
},
],
client=client,
)
mock_post.assert_called_once()
request_body = json.loads(mock_post.call_args.kwargs["data"])
print("request_body: ", json.dumps(request_body, indent=4))
# Verify the request body
assert request_body["messages"][1]["content"][1]["cachePoint"] == {
"type": "default"
}

View file

@@ -30,7 +30,7 @@ class TestCustomPromptManagement(CustomPromptManagement):
model: str,
messages: List[AllMessageValues],
non_default_params: dict,
prompt_id: str,
prompt_id: Optional[str],
prompt_variables: Optional[dict],
dynamic_callback_params: StandardCallbackDynamicParams,
) -> Tuple[str, List[AllMessageValues], dict]:

View file

@@ -78,6 +78,33 @@ def test_initialize_with_api_key(setup_mocks):
assert result["azure_ad_token"] is None
def test_initialize_with_tenant_credentials_env_var(setup_mocks, monkeypatch):
monkeypatch.setenv("AZURE_TENANT_ID", "test-tenant-id")
monkeypatch.setenv("AZURE_CLIENT_ID", "test-client-id")
monkeypatch.setenv("AZURE_CLIENT_SECRET", "test-client-secret")
result = BaseAzureLLM().initialize_azure_sdk_client(
litellm_params={},
api_key=None,
api_base="https://test.openai.azure.com",
model_name="gpt-4",
api_version=None,
is_async=False,
)
# Verify that get_azure_ad_token_from_entrata_id was called
setup_mocks["entrata_token"].assert_called_once_with(
tenant_id="test-tenant-id",
client_id="test-client-id",
client_secret="test-client-secret",
)
# Verify expected result
assert result["api_key"] is None
assert result["azure_endpoint"] == "https://test.openai.azure.com"
assert "azure_ad_token_provider" in result
def test_initialize_with_tenant_credentials(setup_mocks):
# Test with tenant_id, client_id, and client_secret provided
result = BaseAzureLLM().initialize_azure_sdk_client(
@@ -150,8 +177,12 @@ def test_initialize_with_oidc_token(setup_mocks):
assert result["azure_ad_token"] == "mock-oidc-token"
def test_initialize_with_enable_token_refresh(setup_mocks):
def test_initialize_with_enable_token_refresh(setup_mocks, monkeypatch):
litellm._turn_on_debug()
# Enable token refresh
monkeypatch.delenv("AZURE_CLIENT_ID", raising=False)
monkeypatch.delenv("AZURE_CLIENT_SECRET", raising=False)
monkeypatch.delenv("AZURE_TENANT_ID", raising=False)
setup_mocks["litellm"].enable_azure_ad_token_refresh = True
# Test with token refresh enabled
@@ -171,8 +202,11 @@ def test_initialize_with_enable_token_refresh(setup_mocks):
assert "azure_ad_token_provider" in result
def test_initialize_with_token_refresh_error(setup_mocks):
def test_initialize_with_token_refresh_error(setup_mocks, monkeypatch):
# Enable token refresh but make it raise an error
monkeypatch.delenv("AZURE_CLIENT_ID", raising=False)
monkeypatch.delenv("AZURE_CLIENT_SECRET", raising=False)
monkeypatch.delenv("AZURE_TENANT_ID", raising=False)
setup_mocks["litellm"].enable_azure_ad_token_refresh = True
setup_mocks["token_provider"].side_effect = ValueError("Token provider error")

View file

@@ -41,8 +41,10 @@ from unittest.mock import AsyncMock, MagicMock, patch
# Assuming your trim_messages, shorten_message_to_fit_limit, and get_token_count functions are all in a module named 'message_utils'
@pytest.fixture(autouse=True)
def reset_mock_cache():
from litellm.utils import _model_cache
_model_cache.flush_cache()
# Test 1: Check trimming of normal message
def test_basic_trimming():
messages = [
@@ -1539,6 +1541,7 @@ def test_get_valid_models_fireworks_ai(monkeypatch):
litellm.module_level_client, "get", return_value=mock_response
) as mock_post:
valid_models = get_valid_models(check_provider_endpoint=True)
print("valid_models", valid_models)
mock_post.assert_called_once()
assert (
"fireworks_ai/accounts/fireworks/models/llama-3.1-8b-instruct"
@@ -2122,3 +2125,58 @@ def test_get_provider_audio_transcription_config():
config = ProviderConfigManager.get_provider_audio_transcription_config(
model="whisper-1", provider=provider
)
def test_get_valid_models_from_provider():
"""
Test that get_valid_models returns the correct models for a given provider
"""
from litellm.utils import get_valid_models
valid_models = get_valid_models(custom_llm_provider="openai")
assert len(valid_models) > 0
assert "gpt-4o-mini" in valid_models
print("Valid models: ", valid_models)
valid_models.remove("gpt-4o-mini")
assert "gpt-4o-mini" not in valid_models
valid_models = get_valid_models(custom_llm_provider="openai")
assert len(valid_models) > 0
assert "gpt-4o-mini" in valid_models
def test_get_valid_models_from_provider_cache_invalidation(monkeypatch):
"""
Test that get_valid_models returns the correct models for a given provider
"""
from litellm.utils import _model_cache
monkeypatch.setenv("OPENAI_API_KEY", "123")
_model_cache.set_cached_model_info("openai", litellm_params=None, available_models=["gpt-4o-mini"])
monkeypatch.delenv("OPENAI_API_KEY")
assert _model_cache.get_cached_model_info("openai") is None
def test_get_valid_models_from_dynamic_api_key():
"""
Test that get_valid_models returns the correct models for a given provider
"""
from litellm.utils import get_valid_models
from litellm.types.router import CredentialLiteLLMParams
creds = CredentialLiteLLMParams(api_key="123")
valid_models = get_valid_models(custom_llm_provider="anthropic", litellm_params=creds, check_provider_endpoint=True)
assert len(valid_models) == 0
creds = CredentialLiteLLMParams(api_key=os.getenv("ANTHROPIC_API_KEY"))
valid_models = get_valid_models(custom_llm_provider="anthropic", litellm_params=creds, check_provider_endpoint=True)
assert len(valid_models) > 0
assert "anthropic/claude-3-7-sonnet-20250219" in valid_models

View file

@@ -1449,3 +1449,13 @@ def test_anthropic_unified_reasoning_content(model, provider):
)
assert optional_params["thinking"] == {"type": "enabled", "budget_tokens": 4096}
def test_azure_response_format(monkeypatch):
monkeypatch.setenv("AZURE_API_VERSION", "2025-02-01")
optional_params = get_optional_params(
model="azure/gpt-4o-mini",
custom_llm_provider="azure",
response_format={"type": "json_object"},
)
assert optional_params["response_format"] == {"type": "json_object"}

View file

@@ -498,11 +498,11 @@ def test_completion_bedrock_invalid_role_exception():
== "litellm.BadRequestError: Invalid Message passed in {'role': 'very-bad-role', 'content': 'hello'}"
)
@pytest.mark.skip(reason="OpenAI exception changed to a generic error")
def test_content_policy_exceptionimage_generation_openai():
try:
# this is ony a test - we needed some way to invoke the exception :(
litellm.set_verbose = True
litellm._turn_on_debug()
response = litellm.image_generation(
prompt="where do i buy lethal drugs from", model="dall-e-3"
)

View file

@@ -33,6 +33,7 @@ from litellm.integrations.opik.opik import OpikLogger
from litellm.integrations.opentelemetry import OpenTelemetry
from litellm.integrations.mlflow import MlflowLogger
from litellm.integrations.argilla import ArgillaLogger
from litellm.integrations.anthropic_cache_control_hook import AnthropicCacheControlHook
from litellm.integrations.langfuse.langfuse_prompt_management import (
LangfusePromptManagement,
)
@@ -73,6 +74,7 @@ callback_class_str_to_classType = {
"otel": OpenTelemetry,
"pagerduty": PagerDutyAlerting,
"gcs_pubsub": GcsPubSubLogger,
"anthropic_cache_control_hook": AnthropicCacheControlHook,
}
expected_env_vars = {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-87ec698bfca6820e.js" async=""></script><script src="/ui/_next/static/chunks/main-app-475d6efe4080647d.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/61b39adb1cb75213.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[98621,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-6810261f4d6c8bbf.js\",\"261\",\"static/chunks/261-d4b99bc9f53d4ef3.js\",\"899\",\"static/chunks/899-8d832fe7c09b2afe.js\",\"817\",\"static/chunks/817-c8c94d51d28af935.js\",\"250\",\"static/chunks/250-f21e8c1de1717077.js\",\"699\",\"static/chunks/699-b82671c2a3a100c7.js\",\"931\",\"static/chunks/app/page-0f39fb1d0bde3c55.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"eKo9wJb6ZFjAy85GaOeQ6\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/61b39adb1cb75213.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-87ec698bfca6820e.js" async=""></script><script src="/ui/_next/static/chunks/main-app-475d6efe4080647d.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/61b39adb1cb75213.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[37185,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-6810261f4d6c8bbf.js\",\"261\",\"static/chunks/261-d4b99bc9f53d4ef3.js\",\"899\",\"static/chunks/899-8d832fe7c09b2afe.js\",\"817\",\"static/chunks/817-c8c94d51d28af935.js\",\"250\",\"static/chunks/250-ef859ae76fcc7788.js\",\"699\",\"static/chunks/699-b82671c2a3a100c7.js\",\"931\",\"static/chunks/app/page-0f22a8aad3780430.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"cxKWUnb1loD53gKRUqiFm\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/61b39adb1cb75213.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>

View file

@@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[98621,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","899","static/chunks/899-8d832fe7c09b2afe.js","817","static/chunks/817-c8c94d51d28af935.js","250","static/chunks/250-f21e8c1de1717077.js","699","static/chunks/699-b82671c2a3a100c7.js","931","static/chunks/app/page-0f39fb1d0bde3c55.js"],"default",1]
3:I[37185,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","899","static/chunks/899-8d832fe7c09b2afe.js","817","static/chunks/817-c8c94d51d28af935.js","250","static/chunks/250-ef859ae76fcc7788.js","699","static/chunks/699-b82671c2a3a100c7.js","931","static/chunks/app/page-0f22a8aad3780430.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["eKo9wJb6ZFjAy85GaOeQ6",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["cxKWUnb1loD53gKRUqiFm",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[52829,["42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","250","static/chunks/250-f21e8c1de1717077.js","699","static/chunks/699-b82671c2a3a100c7.js","418","static/chunks/app/model_hub/page-068a441595bd0fc3.js"],"default",1]
3:I[52829,["42","static/chunks/42-6810261f4d6c8bbf.js","261","static/chunks/261-d4b99bc9f53d4ef3.js","250","static/chunks/250-ef859ae76fcc7788.js","699","static/chunks/699-b82671c2a3a100c7.js","418","static/chunks/app/model_hub/page-068a441595bd0fc3.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["eKo9wJb6ZFjAy85GaOeQ6",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["cxKWUnb1loD53gKRUqiFm",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-6810261f4d6c8bbf.js","899","static/chunks/899-8d832fe7c09b2afe.js","250","static/chunks/250-f21e8c1de1717077.js","461","static/chunks/app/onboarding/page-957334f0c5d5df41.js"],"default",1]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-6810261f4d6c8bbf.js","899","static/chunks/899-8d832fe7c09b2afe.js","250","static/chunks/250-ef859ae76fcc7788.js","461","static/chunks/app/onboarding/page-957334f0c5d5df41.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["eKo9wJb6ZFjAy85GaOeQ6",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["cxKWUnb1loD53gKRUqiFm",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/61b39adb1cb75213.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@@ -5,6 +5,7 @@ import { Row, Col, Typography, Card } from "antd";
import TextArea from "antd/es/input/TextArea";
import { Team } from "../key_team_helpers/key_list";
import TeamDropdown from "../common_components/team_dropdown";
import CacheControlSettings from "./cache_control_settings";
const { Link } = Typography;
interface AdvancedSettingsProps {
@@ -21,6 +22,7 @@ const AdvancedSettings: React.FC<AdvancedSettingsProps> = ({
const [form] = Form.useForm();
const [customPricing, setCustomPricing] = React.useState(false);
const [pricingModel, setPricingModel] = React.useState<'per_token' | 'per_second'>('per_token');
const [showCacheControl, setShowCacheControl] = React.useState(false);
// Add validation function for numbers
const validateNumber = (_: any, value: string) => {
@ -83,6 +85,24 @@ const AdvancedSettings: React.FC<AdvancedSettingsProps> = ({
}
};
const handleCacheControlChange = (checked: boolean) => {
setShowCacheControl(checked);
if (!checked) {
const currentParams = form.getFieldValue('litellm_extra_params');
try {
let paramsObj = currentParams ? JSON.parse(currentParams) : {};
delete paramsObj.cache_control_injection_points;
if (Object.keys(paramsObj).length > 0) {
form.setFieldValue('litellm_extra_params', JSON.stringify(paramsObj, null, 2));
} else {
form.setFieldValue('litellm_extra_params', '');
}
} catch (error) {
form.setFieldValue('litellm_extra_params', '');
}
}
};
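// Illustrative sketch (editor's note, not part of this change): if litellm_extra_params currently holds
//   { "cache_control_injection_points": [{ "location": "message", "role": "system" }], "some_other_param": true }
// (where "some_other_param" stands in for any other hypothetical key), toggling the switch off rewrites the
// field to { "some_other_param": true }, and clears it to an empty string when no other keys remain.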
return (
<>
<Accordion className="mt-2 mb-4">
@ -150,6 +170,12 @@ const AdvancedSettings: React.FC<AdvancedSettingsProps> = ({
</div>
)}
<CacheControlSettings
form={form}
showCacheControl={showCacheControl}
onCacheControlChange={handleCacheControlChange}
/>
<Form.Item
label="Use in pass through routes"
name="use_in_pass_through"

View file

@ -0,0 +1,159 @@
import React from "react";
import { Form, Switch, Select, Input, Typography } from "antd";
import { PlusOutlined, MinusCircleOutlined } from '@ant-design/icons';
import NumericalInput from "../shared/numerical_input";
const { Text } = Typography;
interface CacheControlInjectionPoint {
location: "message";
role?: "user" | "system" | "assistant";
index?: number;
}
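// Illustrative sketch (editor's note, not part of this file): example values matching the interface above.
// A role-based point applies to every message of that role; an index-based point targets a single message.
const examplePoints: CacheControlInjectionPoint[] = [
  { location: "message", role: "system" }, // cache all system messages
  { location: "message", index: 0 },       // cache only the first message in the request
];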
interface CacheControlSettingsProps {
form: any; // Form instance from parent
showCacheControl: boolean;
onCacheControlChange: (checked: boolean) => void;
}
const CacheControlSettings: React.FC<CacheControlSettingsProps> = ({
form,
showCacheControl,
onCacheControlChange,
}) => {
const updateCacheControlPoints = (injectionPoints: CacheControlInjectionPoint[]) => {
const currentParams = form.getFieldValue('litellm_extra_params');
try {
let paramsObj = currentParams ? JSON.parse(currentParams) : {};
if (injectionPoints.length > 0) {
paramsObj.cache_control_injection_points = injectionPoints;
} else {
delete paramsObj.cache_control_injection_points;
}
if (Object.keys(paramsObj).length > 0) {
form.setFieldValue('litellm_extra_params', JSON.stringify(paramsObj, null, 2));
} else {
form.setFieldValue('litellm_extra_params', '');
}
} catch (error) {
console.error('Error updating cache control points:', error);
}
};
return (
<>
<Form.Item
label="Cache Control"
name="cache_control"
valuePropName="checked"
className="mb-4"
tooltip="Tell litellm where to inject cache control checkpoints. You can specify either by role (to apply to all messages of that role) or by specific message index."
>
<Switch onChange={onCacheControlChange} className="bg-gray-600" />
</Form.Item>
{showCacheControl && (
<div className="ml-6 pl-4 border-l-2 border-gray-200">
<Text className="text-sm text-gray-500 block mb-4">
Specify either a role (to cache all messages of that role) or a specific message index.
If both are provided, the index takes precedence.
</Text>
<Form.List
name="cache_control_injection_points"
initialValue={[{ location: "message" }]}
>
{(fields, { add, remove }) => (
<>
{fields.map((field, index) => (
<div key={field.key} className="flex items-center mb-4 gap-4">
<Form.Item
{...field}
label="Type"
name={[field.name, 'location']}
initialValue="message"
className="mb-0"
style={{ width: '180px' }}
>
<Select disabled options={[{ value: 'message', label: 'Message' }]} />
</Form.Item>
<Form.Item
{...field}
label="Role"
name={[field.name, 'role']}
className="mb-0"
style={{ width: '180px' }}
tooltip="Select a role to cache all messages of this type"
>
<Select
placeholder="Select a role"
allowClear
options={[
{ value: 'user', label: 'User' },
{ value: 'system', label: 'System' },
{ value: 'assistant', label: 'Assistant' },
]}
onChange={() => {
const values = form.getFieldValue('cache_control_injection_points');
updateCacheControlPoints(values);
}}
/>
</Form.Item>
<Form.Item
{...field}
label="Index"
name={[field.name, 'index']}
className="mb-0"
style={{ width: '180px' }}
tooltip="Specify a specific message index (optional)"
>
<NumericalInput
type="number"
placeholder="Optional"
step={1}
min={0}
onChange={() => {
const values = form.getFieldValue('cache_control_injection_points');
updateCacheControlPoints(values);
}}
/>
</Form.Item>
{fields.length > 1 && (
<MinusCircleOutlined
className="text-red-500 cursor-pointer text-lg mt-8"
onClick={() => {
remove(field.name);
setTimeout(() => {
const values = form.getFieldValue('cache_control_injection_points');
updateCacheControlPoints(values);
}, 0);
}}
/>
)}
</div>
))}
<Form.Item>
<button
type="button"
className="flex items-center justify-center w-full border border-dashed border-gray-300 py-2 px-4 text-gray-600 hover:text-blue-600 hover:border-blue-300 transition-all rounded"
onClick={() => add()}
>
<PlusOutlined className="mr-2" />
Add Injection Point
</button>
</Form.Item>
</>
)}
</Form.List>
</div>
)}
</>
);
};
export default CacheControlSettings;
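A minimal sketch (editor's note, not taken from this commit; assumes one role-based and one index-based point) of the object this component serializes into litellm_extra_params:

const exampleExtraParams = {
  cache_control_injection_points: [
    { location: "message", role: "system" }, // cache every system message
    { location: "message", index: 0 },       // cache the first message in the request
  ],
};
// updateCacheControlPoints stores JSON.stringify(exampleExtraParams, null, 2) in the form field,
// and removes the key again (or empties the field) when no injection points remain.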

View file

@ -60,7 +60,7 @@ export const prepareModelAddRequest = async (
continue;
}
// Skip the custom_pricing, pricing_model and cache_control fields as they're only used for UI control
if (key === 'custom_pricing' || key === 'pricing_model') {
if (key === 'custom_pricing' || key === 'pricing_model' || key === 'cache_control') {
continue;
}
if (key == "model_name") {

View file

@ -23,6 +23,7 @@ import { getProviderLogoAndName } from "./provider_info_helpers";
import { getDisplayModelName } from "./view_model/model_name_display";
import AddCredentialsModal from "./model_add/add_credentials_tab";
import ReuseCredentialsModal from "./model_add/reuse_credentials";
import CacheControlSettings from "./add_model/cache_control_settings";
interface ModelInfoViewProps {
modelId: string;
@ -57,6 +58,7 @@ export default function ModelInfoView({
const [isSaving, setIsSaving] = useState(false);
const [isEditing, setIsEditing] = useState(false);
const [existingCredential, setExistingCredential] = useState<CredentialItem | null>(null);
const [showCacheControl, setShowCacheControl] = useState(false);
const canEditModel = userRole === "Admin" || modelData.model_info.created_by === userID;
const isAdmin = userRole === "Admin";
@ -86,6 +88,11 @@ export default function ModelInfoView({
console.log("modelInfoResponse, ", modelInfoResponse);
let specificModelData = modelInfoResponse.data[0];
setLocalModelData(specificModelData);
// Check if cache control is enabled
if (specificModelData?.litellm_params?.cache_control_injection_points) {
setShowCacheControl(true);
}
}
getExistingCredential();
getModelInfo();
@ -112,22 +119,31 @@ export default function ModelInfoView({
if (!accessToken) return;
setIsSaving(true);
let updatedLitellmParams = {
...localModelData.litellm_params,
model: values.litellm_model_name,
api_base: values.api_base,
custom_llm_provider: values.custom_llm_provider,
organization: values.organization,
tpm: values.tpm,
rpm: values.rpm,
max_retries: values.max_retries,
timeout: values.timeout,
stream_timeout: values.stream_timeout,
input_cost_per_token: values.input_cost / 1_000_000,
output_cost_per_token: values.output_cost / 1_000_000,
};
// Handle cache control settings
if (values.cache_control && values.cache_control_injection_points?.length > 0) {
updatedLitellmParams.cache_control_injection_points = values.cache_control_injection_points;
} else {
delete updatedLitellmParams.cache_control_injection_points;
}
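// Illustrative sketch (editor's note, not part of this change): with
//   values = { cache_control: true, cache_control_injection_points: [{ location: "message", role: "user" }] }
// the block above copies the list onto updatedLitellmParams.cache_control_injection_points;
// with cache_control unchecked (or an empty list) the key is deleted so it is not sent in the update payload.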
const updateData = {
model_name: values.model_name,
litellm_params: {
...localModelData.litellm_params,
model: values.litellm_model_name,
api_base: values.api_base,
custom_llm_provider: values.custom_llm_provider,
organization: values.organization,
tpm: values.tpm,
rpm: values.rpm,
max_retries: values.max_retries,
timeout: values.timeout,
stream_timeout: values.stream_timeout,
input_cost_per_token: values.input_cost / 1_000_000,
output_cost_per_token: values.output_cost / 1_000_000,
},
litellm_params: updatedLitellmParams,
model_info: {
id: modelId,
}
@ -139,7 +155,7 @@ export default function ModelInfoView({
...localModelData,
model_name: values.model_name,
litellm_model_name: values.litellm_model_name,
litellm_params: updateData.litellm_params
litellm_params: updatedLitellmParams
};
setLocalModelData(updatedModelData);
@ -337,6 +353,8 @@ export default function ModelInfoView({
(localModelData.litellm_params.input_cost_per_token * 1_000_000) : localModelData.model_info?.input_cost_per_token * 1_000_000 || null,
output_cost: localModelData.litellm_params?.output_cost_per_token ?
(localModelData.litellm_params.output_cost_per_token * 1_000_000) : localModelData.model_info?.output_cost_per_token * 1_000_000 || null,
cache_control: localModelData.litellm_params?.cache_control_injection_points ? true : false,
cache_control_injection_points: localModelData.litellm_params?.cache_control_injection_points || [],
}}
layout="vertical"
onValuesChange={() => setIsDirty(true)}
@ -499,6 +517,37 @@ export default function ModelInfoView({
)}
</div>
{/* Cache Control Section */}
{isEditing ? (
<CacheControlSettings
form={form}
showCacheControl={showCacheControl}
onCacheControlChange={(checked) => setShowCacheControl(checked)}
/>
) : (
<div>
<Text className="font-medium">Cache Control</Text>
<div className="mt-1 p-2 bg-gray-50 rounded">
{localModelData.litellm_params?.cache_control_injection_points ? (
<div>
<p>Enabled</p>
<div className="mt-2">
{localModelData.litellm_params.cache_control_injection_points.map((point: any, i: number) => (
<div key={i} className="text-sm text-gray-600 mb-1">
Location: {point.location}
{point.role && <span>, Role: {point.role}</span>}
{point.index !== undefined && <span>, Index: {point.index}</span>}
</div>
))}
</div>
</div>
) : (
"Disabled"
)}
</div>
</div>
)}
<div>
<Text className="font-medium">Team ID</Text>
<div className="mt-1 p-2 bg-gray-50 rounded">