fix(health.md): add rerank model health check information (#7295)

* fix(health.md): add rerank model health check information

* build(model_prices_and_context_window.json): add gemini 2.0 for google ai studio - pricing + commercial rate limits

* build(model_prices_and_context_window.json): set supports_audio_output = true for gemini-2.0

* docs(team_model_add.md): clarify allowing teams to add models is an enterprise feature

* fix(o1_transformation.py): add support for 'n', 'response_format' and 'stop' params for o1 and 'stream_options' param for o1-mini

* build(model_prices_and_context_window.json): add 'supports_system_message' to supporting openai models

needed as the o1-preview and o1-mini models don't support the 'system' message role

* fix(o1_transformation.py): translate system message based on if o1 model supports it

* fix(o1_transformation.py): return 'stream' param support if o1-mini/o1-preview

o1 currently doesn't support streaming, but the other model versions do

Fixes https://github.com/BerriAI/litellm/issues/7292

* fix(o1_transformation.py): return tool calling/response_format in supported params if model map says so

Fixes https://github.com/BerriAI/litellm/issues/7292

* fix: fix linting errors

* fix: update '_transform_messages'

* fix(o1_transformation.py): fix provider passed for supported param checks

* test(base_llm_unit_tests.py): skip test if api takes >5s to respond

* fix(utils.py): return false in '_supports_factory' if the value can't be found

* fix(o1_transformation.py): always return stream + stream_options as supported params + handle stream options being passed in for azure o1

* feat(openai.py): support stream faking natively in openai handler

Allows streaming to be faked for just the "o1" model, while keeping native streaming for o1-mini and o1-preview

Fixes https://github.com/BerriAI/litellm/issues/7292

* fix(openai.py): use inference param instead of original optional param
Krish Dholakia 2024-12-18 19:18:10 -08:00 committed by GitHub
parent e95820367f
commit 1a4910f6c0
34 changed files with 800 additions and 515 deletions

View file

@ -121,6 +121,20 @@ model_list:
mode: audio_speech
```
### Rerank Models
To run rerank health checks, specify the mode as "rerank" in your config for the relevant model.
```yaml
model_list:
- model_name: rerank-english-v3.0
litellm_params:
model: cohere/rerank-english-v3.0
api_key: os.environ/COHERE_API_KEY
model_info:
mode: rerank
```
### Batch Models (Azure Only)
For Azure models deployed as 'batch' models, set `mode: batch`.
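
As a quick way to verify the checks, here is a minimal sketch that calls the proxy's `/health` endpoint; it assumes a locally running proxy on port 4000 started with a config like the one above and a master key of `sk-1234`. Rerank and batch models appear in the same healthy/unhealthy lists as other modes.

```python
# Minimal sketch: query the proxy's /health endpoint to see the configured checks run.
# Assumptions: proxy at http://0.0.0.0:4000, master key "sk-1234".
import requests

resp = requests.get(
    "http://0.0.0.0:4000/health",
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.json())  # e.g. {"healthy_endpoints": [...], "unhealthy_endpoints": [...], ...}
```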

View file

@ -1,4 +1,13 @@
# Allow Teams to Add Models
# ✨ Allow Teams to Add Models
:::info
This is an Enterprise feature.
[Enterprise Pricing](https://www.litellm.ai/#pricing)
[Contact us here to get a free trial](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
Allow a team to add their own models/keys for a project - so any OpenAI call they make uses their own OpenAI key.

View file

@ -3144,7 +3144,9 @@ def prompt_factory(
else:
return gemini_text_image_pt(messages=messages)
elif custom_llm_provider == "mistral":
return litellm.MistralConfig()._transform_messages(messages=messages)
return litellm.MistralConfig()._transform_messages(
messages=messages, model=model
)
elif custom_llm_provider == "bedrock":
if "amazon.titan-text" in model:
return amazon_titan_pt(messages=messages)

View file

@ -260,12 +260,6 @@ class AnthropicTextConfig(BaseConfig):
return str(prompt)
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
"Not required"
raise NotImplementedError
def get_model_response_iterator(
self,
streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],

View file

@ -57,6 +57,7 @@ class AzureOpenAIO1ChatCompletion(AzureChatCompletion):
client=None,
):
stream: Optional[bool] = optional_params.pop("stream", False)
stream_options: Optional[dict] = optional_params.pop("stream_options", None)
response = super().completion(
model,
messages,
@ -90,6 +91,7 @@ class AzureOpenAIO1ChatCompletion(AzureChatCompletion):
model=model,
custom_llm_provider="openai",
logging_obj=logging_obj,
stream_options=stream_options,
)
return streaming_response
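
A usage sketch of the behaviour this hunk enables, assuming an Azure deployment named `o1-mini` and the usual `AZURE_API_KEY` / `AZURE_API_BASE` / `AZURE_API_VERSION` environment variables: `stream_options` is now popped from `optional_params` and threaded through to the stream wrapper instead of being dropped.

```python
# Sketch only: assumes an Azure OpenAI deployment called "o1-mini" plus AZURE_* env vars.
import litellm

response = litellm.completion(
    model="azure/o1-mini",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},  # forwarded to the streaming wrapper per this change
)
for chunk in response:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```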

View file

@ -2,11 +2,11 @@ from typing import List, Optional, Tuple
import litellm
from litellm._logging import verbose_logger
from litellm.llms.openai.openai import OpenAIConfig
from litellm.litellm_core_utils.prompt_templates.common_utils import (
_audio_or_image_in_message_content,
convert_content_list_to_str,
)
from litellm.llms.openai.openai import OpenAIConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import ProviderField
@ -33,6 +33,7 @@ class AzureAIStudioConfig(OpenAIConfig):
def _transform_messages(
self,
messages: List[AllMessageValues],
model: str,
) -> List:
"""
- Azure AI Studio doesn't support content as a list. This handles:

View file

@ -82,6 +82,14 @@ class BaseConfig(ABC):
and v is not None
}
def should_fake_stream(
self, model: str, custom_llm_provider: Optional[str] = None
) -> bool:
"""
Returns True if the model/provider should fake stream
"""
return False
@abstractmethod
def get_supported_openai_params(self, model: str) -> list:
pass
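
To illustrate the intent of the new hook, here is a minimal stand-in (not litellm's real class hierarchy): the base config defaults to native streaming, and a provider config can opt specific models into faked streaming.

```python
# Minimal stand-in showing the should_fake_stream() hook; the rule below is hypothetical
# and simply mirrors the o1 behaviour described in this PR.
from typing import Optional


class SketchBaseConfig:
    def should_fake_stream(self, model: str, custom_llm_provider: Optional[str] = None) -> bool:
        return False  # default: stream natively


class SketchO1Config(SketchBaseConfig):
    def should_fake_stream(self, model: str, custom_llm_provider: Optional[str] = None) -> bool:
        # only o1-mini / o1-preview stream natively; everything else in the family is faked
        return not any(m in model for m in ("o1-mini", "o1-preview"))


print(SketchO1Config().should_fake_stream("o1"))       # True  -> handler wraps the full response
print(SketchO1Config().should_fake_stream("o1-mini"))  # False -> handler streams natively
```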

View file

@ -131,11 +131,6 @@ class ClarifaiConfig(BaseConfig):
headers["Authorization"] = f"Bearer {api_key}"
return headers
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
raise NotImplementedError
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -158,11 +158,6 @@ class CloudflareChatConfig(BaseConfig):
message=error_message,
)
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
raise NotImplementedError
def get_model_response_iterator(
self,
streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],

View file

@ -365,8 +365,3 @@ class CohereChatConfig(BaseConfig):
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:
return CohereError(status_code=status_code, message=error_message)
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
raise NotImplementedError

View file

@ -121,12 +121,6 @@ class CohereTextConfig(BaseConfig):
api_key=api_key,
)
def _transform_messages(
self,
messages: List[AllMessageValues],
) -> List[AllMessageValues]:
raise NotImplementedError
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -2,11 +2,12 @@
Handles the chat completion request for Databricks
"""
from typing import Any, Callable, Literal, Optional, Tuple, Union
from typing import Any, Callable, List, Literal, Optional, Tuple, Union, cast
from httpx._config import Timeout
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import CustomStreamingDecoder
from litellm.utils import ModelResponse
@ -44,7 +45,9 @@ class DatabricksChatCompletion(OpenAILikeChatHandler, DatabricksBase):
streaming_decoder: Optional[CustomStreamingDecoder] = None,
fake_stream: bool = False,
):
messages = DatabricksConfig()._transform_messages(messages) # type: ignore
messages = DatabricksConfig()._transform_messages(
messages=cast(List[AllMessageValues], messages), model=model
)
api_base, headers = self.databricks_validate_environment(
api_base=api_base,
api_key=api_key,

View file

@ -7,14 +7,14 @@ from typing import List, Optional, Union
from pydantic import BaseModel
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import ProviderField
from ...openai_like.chat.transformation import OpenAILikeChatConfig
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_messages_with_content_list_to_str_conversion,
strip_name_from_messages,
)
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import ProviderField
from ...openai_like.chat.transformation import OpenAILikeChatConfig
class DatabricksConfig(OpenAILikeChatConfig):
@ -86,7 +86,7 @@ class DatabricksConfig(OpenAILikeChatConfig):
return False
def _transform_messages(
self, messages: List[AllMessageValues]
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
"""
Databricks does not support:
@ -102,4 +102,4 @@ class DatabricksConfig(OpenAILikeChatConfig):
new_messages.append(_message)
new_messages = handle_messages_with_content_list_to_str_conversion(new_messages)
new_messages = strip_name_from_messages(new_messages)
return super()._transform_messages(new_messages)
return super()._transform_messages(messages=new_messages, model=model)
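
For context, the normalization these `_transform_messages` overrides delegate to (`handle_messages_with_content_list_to_str_conversion`) collapses list-style content into a plain string. The snippet below is an illustrative re-implementation, not litellm's actual helper.

```python
# Illustrative only - a simplified version of the "content list -> string" normalization.
def content_list_to_str(message: dict) -> dict:
    content = message.get("content")
    if isinstance(content, list):
        text = "".join(part.get("text", "") for part in content if isinstance(part, dict))
        return {**message, "content": text}
    return message


print(content_list_to_str(
    {"role": "user", "content": [{"type": "text", "text": "Hello "}, {"type": "text", "text": "Databricks"}]}
))
# -> {'role': 'user', 'content': 'Hello Databricks'}
```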

View file

@ -8,26 +8,26 @@ from typing import List, Optional, Tuple, Union
from pydantic import BaseModel
import litellm
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_messages_with_content_list_to_str_conversion,
)
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage
from ....utils import _remove_additional_properties, _remove_strict_from_schema
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_messages_with_content_list_to_str_conversion,
)
class DeepSeekChatConfig(OpenAIGPTConfig):
def _transform_messages(
self, messages: List[AllMessageValues]
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
"""
DeepSeek does not support content in list format.
"""
messages = handle_messages_with_content_list_to_str_conversion(messages)
return super()._transform_messages(messages)
return super()._transform_messages(messages=messages, model=model)
def _get_openai_compatible_provider_info(
self, api_base: Optional[str], api_key: Optional[str]

View file

@ -2,11 +2,12 @@
Handles the chat completion request for groq
"""
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, List, Optional, Union, cast
from httpx._config import Timeout
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import CustomStreamingDecoder
from litellm.utils import ModelResponse
@ -42,7 +43,9 @@ class GroqChatCompletion(OpenAILikeChatHandler):
streaming_decoder: Optional[CustomStreamingDecoder] = None,
fake_stream: bool = False,
):
messages = GroqChatConfig()._transform_messages(messages) # type: ignore
messages = GroqChatConfig()._transform_messages(
messages=cast(List[AllMessageValues], messages), model=model
)
if optional_params.get("stream") is True:
fake_stream = GroqChatConfig()._should_fake_stream(optional_params)

View file

@ -61,7 +61,7 @@ class GroqChatConfig(OpenAIGPTConfig):
def get_config(cls):
return super().get_config()
def _transform_messages(self, messages: List[AllMessageValues]) -> List:
def _transform_messages(self, messages: List[AllMessageValues], model: str) -> List:
for idx, message in enumerate(messages):
"""
1. Don't pass 'null' function_call assistant message to groq - https://github.com/BerriAI/litellm/issues/5839

View file

@ -369,12 +369,6 @@ class HuggingfaceChatConfig(BaseConfig):
headers = {**headers, **default_headers}
return headers
def _transform_messages(
self,
messages: List[AllMessageValues],
) -> List[AllMessageValues]:
return messages
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -9,11 +9,11 @@ Docs - https://docs.mistral.ai/api/
import types
from typing import List, Literal, Optional, Tuple, Union
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.litellm_core_utils.prompt_templates.common_utils import (
handle_messages_with_content_list_to_str_conversion,
strip_none_values_from_message,
)
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import AllMessageValues
@ -148,7 +148,7 @@ class MistralConfig(OpenAIGPTConfig):
return api_base, dynamic_api_key
def _transform_messages(
self, messages: List[AllMessageValues]
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
"""
- handles scenario where content is list and not string

View file

@ -23,6 +23,7 @@ from litellm.types.llms.openai import (
from litellm.types.utils import (
GenericStreamingChunk,
ModelInfo,
ModelInfoBase,
ModelResponse,
ProviderField,
StreamingChoices,
@ -198,7 +199,7 @@ class OllamaConfig(BaseConfig):
return v
return None
def get_model_info(self, model: str) -> ModelInfo:
def get_model_info(self, model: str) -> ModelInfoBase:
"""
curl http://localhost:11434/api/show -d '{
"name": "mistral"
@ -222,11 +223,10 @@ class OllamaConfig(BaseConfig):
_max_tokens: Optional[int] = self._get_max_tokens(model_info)
return ModelInfo(
return ModelInfoBase(
key=model,
litellm_provider="ollama",
mode="chat",
supported_openai_params=self.get_supported_openai_params(model=model),
supports_function_calling=self._supports_function_calling(model_info),
input_cost_per_token=0.0,
output_cost_per_token=0.0,
@ -235,11 +235,6 @@ class OllamaConfig(BaseConfig):
max_output_tokens=_max_tokens,
)
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
return messages
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, Headers]
) -> BaseLLMException:
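
A small usage sketch of the changed return type (assumes a local Ollama server on the default port with the `mistral` model pulled): `OllamaConfig.get_model_info()` now returns a `ModelInfoBase`, i.e. without the `supported_openai_params` field that the top-level `get_model_info()` adds.

```python
# Sketch: requires a running Ollama server with the "mistral" model available locally.
import litellm

info = litellm.OllamaConfig().get_model_info("mistral")
print(info["litellm_provider"], info["mode"], info.get("supports_function_calling"))
```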

View file

@ -23,11 +23,6 @@ else:
class OobaboogaConfig(OpenAIGPTConfig):
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
return messages
def get_error_class(
self,
error_message: str,

View file

@ -164,7 +164,7 @@ class OpenAIGPTConfig(BaseConfig):
)
def _transform_messages(
self, messages: List[AllMessageValues]
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
return messages

View file

@ -15,7 +15,14 @@ import types
from typing import Any, List, Optional, Union
import litellm
from litellm import verbose_logger
from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage
from litellm.utils import (
supports_function_calling,
supports_response_schema,
supports_system_messages,
)
from .gpt_transformation import OpenAIGPTConfig
@ -29,6 +36,15 @@ class OpenAIO1Config(OpenAIGPTConfig):
def get_config(cls):
return super().get_config()
def should_fake_stream(
self, model: str, custom_llm_provider: Optional[str] = None
) -> bool:
supported_stream_models = ["o1-mini", "o1-preview"]
for supported_model in supported_stream_models:
if supported_model in model:
return False
return True
def get_supported_openai_params(self, model: str) -> list:
"""
Get the supported OpenAI params for the given model
@ -38,21 +54,37 @@ class OpenAIO1Config(OpenAIGPTConfig):
all_openai_params = super().get_supported_openai_params(model=model)
non_supported_params = [
"logprobs",
"tools",
"tool_choice",
"parallel_tool_calls",
"function_call",
"functions",
"top_p",
"n",
"presence_penalty",
"frequency_penalty",
"top_logprobs",
"response_format",
"stop",
"stream_options",
]
try:
model, custom_llm_provider, api_base, api_key = get_llm_provider(
model=model
)
except Exception:
verbose_logger.debug(
f"Unable to infer model provider for model={model}, defaulting to openai for o1 supported param check"
)
custom_llm_provider = "openai"
_supports_function_calling = supports_function_calling(
model, custom_llm_provider
)
_supports_response_schema = supports_response_schema(model, custom_llm_provider)
if not _supports_function_calling:
non_supported_params.append("tools")
non_supported_params.append("tool_choice")
non_supported_params.append("parallel_tool_calls")
non_supported_params.append("function_call")
non_supported_params.append("functions")
if not _supports_response_schema:
non_supported_params.append("response_format")
return [
param for param in all_openai_params if param not in non_supported_params
]
@ -95,16 +127,16 @@ class OpenAIO1Config(OpenAIGPTConfig):
return False
def _transform_messages(
self, messages: List[AllMessageValues]
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
"""
Handles limitations of O-1 model family.
- modalities: image => drop param (if user opts in to dropping param)
- role: system ==> translate to role 'user'
"""
_supports_system_messages = supports_system_messages(model, "openai")
for i, message in enumerate(messages):
if message["role"] == "system":
if message["role"] == "system" and not _supports_system_messages:
new_message = ChatCompletionUserMessage(
content=message["content"], role="user"
)
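
A quick way to observe the effect of this map-driven filtering; output depends on the locally loaded model-cost map, and the values in the comment are what the updated map implies.

```python
# Sketch: inspect which OpenAI params litellm reports for the o1 family after this change.
import litellm

for m in ("o1", "o1-mini"):
    params = litellm.get_supported_openai_params(model=m, custom_llm_provider="openai") or []
    print(m, "tools" in params, "response_format" in params)
# expected per the updated map: o1 -> True True, o1-mini -> False False
```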

View file

@ -33,6 +33,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
prompt_factory,
)
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
from litellm.llms.custom_httpx.http_handler import _DEFAULT_TTL_FOR_HTTPX_CLIENTS
from litellm.secret_managers.main import get_secret_str
from litellm.types.utils import (
@ -198,7 +199,7 @@ class OpenAIConfig(BaseConfig):
return optional_params
def _transform_messages(
self, messages: List[AllMessageValues]
self, messages: List[AllMessageValues], model: str
) -> List[AllMessageValues]:
return messages
@ -410,6 +411,24 @@ class OpenAIChatCompletion(BaseLLM):
else:
raise e
def mock_streaming(
self,
response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
model: str,
stream_options: Optional[dict] = None,
) -> CustomStreamWrapper:
completion_stream = MockResponseIterator(model_response=response)
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="openai",
logging_obj=logging_obj,
stream_options=stream_options,
)
return streaming_response
def completion( # type: ignore # noqa: PLR0915
self,
model_response: ModelResponse,
@ -433,8 +452,21 @@ class OpenAIChatCompletion(BaseLLM):
):
super().completion()
try:
fake_stream: bool = False
if custom_llm_provider is not None and model is not None:
provider_config = ProviderConfigManager.get_provider_chat_config(
model=model, provider=LlmProviders(custom_llm_provider)
)
fake_stream = provider_config.should_fake_stream(
model=model, custom_llm_provider=custom_llm_provider
)
inference_params = optional_params.copy()
stream_options: Optional[dict] = inference_params.pop(
"stream_options", None
)
stream: Optional[bool] = inference_params.pop("stream", False)
if headers:
optional_params["extra_headers"] = headers
inference_params["extra_headers"] = headers
if model is None or messages is None:
raise OpenAIError(status_code=422, message="Missing model or messages")
@ -456,7 +488,9 @@ class OpenAIChatCompletion(BaseLLM):
if isinstance(provider_config, OpenAIGPTConfig) or isinstance(
provider_config, OpenAIConfig
):
messages = provider_config._transform_messages(messages)
messages = provider_config._transform_messages(
messages=messages, model=model
)
for _ in range(
2
@ -464,7 +498,7 @@ class OpenAIChatCompletion(BaseLLM):
data = OpenAIConfig().transform_request(
model=model,
messages=messages,
optional_params=optional_params,
optional_params=inference_params,
litellm_params=litellm_params,
headers=headers or {},
)
@ -472,7 +506,7 @@ class OpenAIChatCompletion(BaseLLM):
try:
max_retries = data.pop("max_retries", 2)
if acompletion is True:
if optional_params.get("stream", False):
if stream is True and fake_stream is False:
return self.async_streaming(
logging_obj=logging_obj,
headers=headers,
@ -485,11 +519,13 @@ class OpenAIChatCompletion(BaseLLM):
max_retries=max_retries,
organization=organization,
drop_params=drop_params,
stream_options=stream_options,
)
else:
return self.acompletion(
data=data,
headers=headers,
model=model,
logging_obj=logging_obj,
model_response=model_response,
api_base=api_base,
@ -499,8 +535,9 @@ class OpenAIChatCompletion(BaseLLM):
max_retries=max_retries,
organization=organization,
drop_params=drop_params,
fake_stream=fake_stream,
)
elif optional_params.get("stream", False):
elif stream is True and fake_stream is False:
return self.streaming(
logging_obj=logging_obj,
headers=headers,
@ -512,6 +549,7 @@ class OpenAIChatCompletion(BaseLLM):
client=client,
max_retries=max_retries,
organization=organization,
stream_options=stream_options,
)
else:
if not isinstance(max_retries, int):
@ -557,16 +595,26 @@ class OpenAIChatCompletion(BaseLLM):
original_response=stringified_response,
additional_args={"complete_input_dict": data},
)
return convert_to_model_response_object(
final_response_obj = convert_to_model_response_object(
response_object=stringified_response,
model_response_object=model_response,
_response_headers=headers,
)
if fake_stream is True:
return self.mock_streaming(
response=cast(ModelResponse, final_response_obj),
logging_obj=logging_obj,
model=model,
stream_options=stream_options,
)
return final_response_obj
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
optional_params = drop_params_from_unprocessable_entity_error(
e, optional_params
inference_params = drop_params_from_unprocessable_entity_error(
e, inference_params
)
else:
raise e
@ -623,6 +671,7 @@ class OpenAIChatCompletion(BaseLLM):
async def acompletion(
self,
data: dict,
model: str,
model_response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
timeout: Union[float, httpx.Timeout],
@ -633,6 +682,8 @@ class OpenAIChatCompletion(BaseLLM):
max_retries=None,
headers=None,
drop_params: Optional[bool] = None,
stream_options: Optional[dict] = None,
fake_stream: bool = False,
):
response = None
for _ in range(
@ -667,6 +718,7 @@ class OpenAIChatCompletion(BaseLLM):
openai_aclient=openai_aclient, data=data, timeout=timeout
)
stringified_response = response.model_dump()
logging_obj.post_call(
input=data["messages"],
api_key=api_key,
@ -674,12 +726,22 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
)
logging_obj.model_call_details["response_headers"] = headers
return convert_to_model_response_object(
final_response_obj = convert_to_model_response_object(
response_object=stringified_response,
model_response_object=model_response,
hidden_params={"headers": headers},
_response_headers=headers,
)
if fake_stream is True:
return self.mock_streaming(
response=cast(ModelResponse, final_response_obj),
logging_obj=logging_obj,
model=model,
stream_options=stream_options,
)
return final_response_obj
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
@ -710,7 +772,11 @@ class OpenAIChatCompletion(BaseLLM):
client=None,
max_retries=None,
headers=None,
stream_options: Optional[dict] = None,
):
data["stream"] = True
if stream_options is not None:
data["stream_options"] = stream_options
openai_client: OpenAI = self._get_openai_client( # type: ignore
is_async=False,
api_key=api_key,
@ -761,8 +827,12 @@ class OpenAIChatCompletion(BaseLLM):
max_retries=None,
headers=None,
drop_params: Optional[bool] = None,
stream_options: Optional[dict] = None,
):
response = None
data["stream"] = True
if stream_options is not None:
data["stream_options"] = stream_options
for _ in range(2):
try:
openai_aclient: AsyncOpenAI = self._get_openai_client( # type: ignore
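
End to end, this is what the new path enables. A usage sketch, assuming `OPENAI_API_KEY` is set: `stream=True` on the base "o1" model now returns a faked stream (the full response wrapped in `MockResponseIterator`), while o1-mini / o1-preview keep native streaming.

```python
# Usage sketch (assumes OPENAI_API_KEY): streaming "o1" is faked behind the same interface.
import litellm

response = litellm.completion(
    model="o1",
    messages=[{"role": "user", "content": "Say hi"}],
    stream=True,
)
for chunk in response:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```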

View file

@ -284,7 +284,9 @@ class OpenAILikeChatHandler(OpenAILikeBase):
if isinstance(provider_config, OpenAIGPTConfig) or isinstance(
provider_config, OpenAIConfig
):
messages = provider_config._transform_messages(messages)
messages = provider_config._transform_messages(
messages=messages, model=model
)
data = {
"model": model,

View file

@ -139,11 +139,6 @@ class PredibaseConfig(BaseConfig):
"Predibase transformation currently done in handler.py. Need to migrate to this file."
)
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
return messages
def transform_request(
self,
model: str,

View file

@ -130,11 +130,6 @@ class ReplicateConfig(BaseConfig):
return split_model[1]
return model
def _transform_messages(
self, messages: List[AllMessageValues]
) -> List[AllMessageValues]:
return messages
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -57,12 +57,6 @@ class SagemakerConfig(BaseConfig):
def get_config(cls):
return super().get_config()
def _transform_messages(
self,
messages: List[AllMessageValues],
) -> List[AllMessageValues]:
return messages
def get_error_class(
self, error_message: str, status_code: int, headers: Union[dict, Headers]
) -> BaseLLMException:

View file

@ -240,12 +240,6 @@ class IBMWatsonXAIConfig(BaseConfig):
"us-south",
]
def _transform_messages(
self,
messages: List[AllMessageValues],
) -> List[AllMessageValues]:
return messages
def get_error_class(
self, error_message: str, status_code: int, headers: Union[Dict, httpx.Headers]
) -> BaseLLMException:

View file

@ -13,7 +13,8 @@
"supports_audio_input": true,
"supports_audio_output": true,
"supports_prompt_caching": true,
"supports_response_schema": true
"supports_response_schema": true,
"supports_system_messages": true
},
"sambanova/Meta-Llama-3.1-8B-Instruct": {
"max_tokens": 16000,
@ -94,7 +95,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o": {
"max_tokens": 16384,
@ -109,7 +111,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-audio-preview": {
"max_tokens": 16384,
@ -124,7 +127,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-audio-preview-2024-10-01": {
"max_tokens": 16384,
@ -139,7 +143,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini-audio-preview-2024-12-17": {
"max_tokens": 16384,
@ -154,7 +159,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini": {
"max_tokens": 16384,
@ -169,7 +175,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-mini-2024-07-18": {
"max_tokens": 16384,
@ -184,7 +191,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"o1": {
"max_tokens": 100000,
@ -198,7 +206,9 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_response_schema": true
},
"o1-mini": {
"max_tokens": 65536,
@ -209,8 +219,6 @@
"cache_read_input_token_cost": 0.0000015,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -223,8 +231,6 @@
"cache_read_input_token_cost": 0.0000015,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -237,8 +243,6 @@
"cache_read_input_token_cost": 0.0000075,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -251,8 +255,6 @@
"cache_read_input_token_cost": 0.0000075,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -268,7 +270,9 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_response_schema": true
},
"chatgpt-4o-latest": {
"max_tokens": 4096,
@ -281,7 +285,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-2024-05-13": {
"max_tokens": 4096,
@ -294,7 +299,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -309,7 +315,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-2024-11-20": {
"max_tokens": 16384,
@ -324,7 +331,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-realtime-preview-2024-10-01": {
"max_tokens": 4096,
@ -341,7 +349,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-realtime-preview": {
"max_tokens": 4096,
@ -357,7 +366,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-realtime-preview-2024-12-17": {
"max_tokens": 4096,
@ -373,7 +383,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini-realtime-preview": {
"max_tokens": 4096,
@ -390,7 +401,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini-realtime-preview-2024-12-17": {
"max_tokens": 4096,
@ -407,7 +419,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4-turbo-preview": {
"max_tokens": 4096,
@ -419,7 +432,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-0314": {
"max_tokens": 4096,
@ -429,7 +443,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-0613": {
"max_tokens": 4096,
@ -440,7 +455,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-32k": {
"max_tokens": 4096,
@ -450,7 +466,8 @@
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-32k-0314": {
"max_tokens": 4096,
@ -460,7 +477,8 @@
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-32k-0613": {
"max_tokens": 4096,
@ -470,7 +488,8 @@
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-turbo": {
"max_tokens": 4096,
@ -483,7 +502,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
@ -496,7 +516,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
@ -508,7 +529,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-0125-preview": {
"max_tokens": 4096,
@ -520,7 +542,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-vision-preview": {
"max_tokens": 4096,
@ -531,7 +554,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-1106-vision-preview": {
"max_tokens": 4096,
@ -542,7 +566,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
@ -553,7 +578,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4097,
@ -563,7 +589,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-0613": {
"max_tokens": 4097,
@ -574,7 +601,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-1106": {
"max_tokens": 16385,
@ -586,7 +614,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-0125": {
"max_tokens": 16385,
@ -598,7 +627,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16385,
@ -608,7 +638,8 @@
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16385,
@ -618,7 +649,8 @@
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"ft:gpt-3.5-turbo": {
"max_tokens": 4096,
@ -627,7 +659,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-3.5-turbo-0125": {
"max_tokens": 4096,
@ -636,7 +669,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-3.5-turbo-1106": {
"max_tokens": 4096,
@ -645,7 +679,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-3.5-turbo-0613": {
"max_tokens": 4096,
@ -654,7 +689,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-4-0613": {
"max_tokens": 4096,
@ -665,7 +701,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing",
"supports_system_messages": true
},
"ft:gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -678,7 +715,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true
"supports_vision": true,
"supports_system_messages": true
},
"ft:gpt-4o-2024-11-20": {
"max_tokens": 16384,
@ -693,7 +731,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"ft:gpt-4o-mini-2024-07-18": {
"max_tokens": 16384,
@ -708,7 +747,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"ft:davinci-002": {
"max_tokens": 16384,
@ -3166,6 +3206,42 @@
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash"
},
"gemini/gemini-2.0-flash-exp": {
"max_tokens": 8192,
"max_input_tokens": 1048576,
"max_output_tokens": 8192,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_image": 0,
"input_cost_per_video_per_second": 0,
"input_cost_per_audio_per_second": 0,
"input_cost_per_token": 0,
"input_cost_per_character": 0,
"input_cost_per_token_above_128k_tokens": 0,
"input_cost_per_character_above_128k_tokens": 0,
"input_cost_per_image_above_128k_tokens": 0,
"input_cost_per_video_per_second_above_128k_tokens": 0,
"input_cost_per_audio_per_second_above_128k_tokens": 0,
"output_cost_per_token": 0,
"output_cost_per_character": 0,
"output_cost_per_token_above_128k_tokens": 0,
"output_cost_per_character_above_128k_tokens": 0,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": true,
"tpm": 4000000,
"rpm": 10,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash"
},
"vertex_ai/claude-3-sonnet": {

View file

@ -74,11 +74,7 @@ class ProviderField(TypedDict):
field_value: str
class ModelInfo(TypedDict, total=False):
"""
Model info for a given model, this is information found in litellm.model_prices_and_context_window.json
"""
class ModelInfoBase(TypedDict, total=False):
key: Required[str] # the key in litellm.model_cost which is returned
max_tokens: Required[Optional[int]]
@ -119,7 +115,6 @@ class ModelInfo(TypedDict, total=False):
"completion", "embedding", "image_generation", "chat", "audio_transcription"
]
]
supported_openai_params: Required[Optional[List[str]]]
supports_system_messages: Optional[bool]
supports_response_schema: Optional[bool]
supports_vision: Optional[bool]
@ -133,6 +128,14 @@ class ModelInfo(TypedDict, total=False):
rpm: Optional[int]
class ModelInfo(ModelInfoBase, total=False):
"""
Model info for a given model, this is information found in litellm.model_prices_and_context_window.json
"""
supported_openai_params: Required[Optional[List[str]]]
class GenericStreamingChunk(TypedDict, total=False):
text: Required[str]
tool_use: Optional[ChatCompletionToolCallChunk]
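
The split is backwards compatible for callers: `get_model_info()` still returns the full `ModelInfo` (including `supported_openai_params`), while `ModelInfoBase` is what the new internal helper builds. A small usage sketch:

```python
# Sketch: the public API still exposes supported_openai_params on the returned dict.
import litellm

info = litellm.get_model_info("gpt-4o")
print(info["litellm_provider"], info["mode"])
print("tools" in (info.get("supported_openai_params") or []))
```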

View file

@ -132,6 +132,7 @@ from litellm.types.utils import (
LlmProviders,
Message,
ModelInfo,
ModelInfoBase,
ModelResponse,
ModelResponseStream,
ProviderField,
@ -1645,16 +1646,10 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
Raises:
Exception: If the given model is not found in model_prices_and_context_window.json.
"""
try:
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
if model_info.get("supports_system_messages", False) is True:
return True
return False
except Exception:
raise Exception(
f"Model not supports system messages. You passed model={model}, custom_llm_provider={custom_llm_provider}."
return _supports_factory(
model=model,
custom_llm_provider=custom_llm_provider,
key="supports_system_messages",
)
@ -1684,25 +1679,11 @@ def supports_response_schema(model: str, custom_llm_provider: Optional[str]) ->
if custom_llm_provider in PROVIDERS_GLOBALLY_SUPPORT_RESPONSE_SCHEMA:
return True
try:
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
if model_info.get("supports_response_schema", False) is True:
return True
except Exception:
## check if provider supports response schema globally
supported_params = get_supported_openai_params(
return _supports_factory(
model=model,
custom_llm_provider=custom_llm_provider,
request_type="chat_completion",
key="supports_response_schema",
)
if supported_params is not None and "response_schema" in supported_params:
return True
return False
def supports_function_calling(
@ -1721,22 +1702,10 @@ def supports_function_calling(
Raises:
Exception: If the given model is not found or there's an error in retrieval.
"""
try:
model, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=model, custom_llm_provider=custom_llm_provider
)
## CHECK IF MODEL SUPPORTS FUNCTION CALLING ##
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
if model_info.get("supports_function_calling", False) is True:
return True
return False
except Exception as e:
raise Exception(
f"Model not found or error in checking function calling support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
return _supports_factory(
model=model,
custom_llm_provider=custom_llm_provider,
key="supports_function_calling",
)
@ -1759,7 +1728,7 @@ def _supports_factory(model: str, custom_llm_provider: Optional[str], key: str)
model=model, custom_llm_provider=custom_llm_provider
)
model_info = litellm.get_model_info(
model_info = _get_model_info_helper(
model=model, custom_llm_provider=custom_llm_provider
)
@ -1767,9 +1736,10 @@ def _supports_factory(model: str, custom_llm_provider: Optional[str], key: str)
return True
return False
except Exception as e:
raise Exception(
verbose_logger.debug(
f"Model not found or error in checking {key} support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
)
return False
def supports_audio_input(model: str, custom_llm_provider: Optional[str] = None) -> bool:
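
With the refactor, the `supports_*` helpers share `_supports_factory`, which now logs at debug level and returns False for unmapped models instead of raising. A usage sketch (results depend on the local model map):

```python
# Sketch: the helpers no longer raise for unknown models - they return False.
from litellm.utils import supports_function_calling, supports_system_messages

print(supports_function_calling("gpt-4o"))                   # True per the model map
print(supports_system_messages("o1-mini", "openai"))         # False - o1-mini lacks the flag
print(supports_function_calling("some-unmapped-model-xyz"))  # False instead of an exception
```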
@ -4196,9 +4166,239 @@ def _get_potential_model_names(
)
def get_model_info( # noqa: PLR0915
def _get_max_position_embeddings(model_name: str) -> Optional[int]:
# Construct the URL for the config.json file
config_url = f"https://huggingface.co/{model_name}/raw/main/config.json"
try:
# Make the HTTP request to get the raw JSON file
response = litellm.module_level_client.get(config_url)
response.raise_for_status() # Raise an exception for bad responses (4xx or 5xx)
# Parse the JSON response
config_json = response.json()
# Extract and return the max_position_embeddings
max_position_embeddings = config_json.get("max_position_embeddings")
if max_position_embeddings is not None:
return max_position_embeddings
else:
return None
except Exception:
return None
def _get_model_info_helper( # noqa: PLR0915
model: str, custom_llm_provider: Optional[str] = None
) -> ModelInfo:
) -> ModelInfoBase:
"""
Helper for 'get_model_info'. Separated out to avoid infinite loop caused by returning 'supported_openai_param's
"""
try:
azure_llms = {**litellm.azure_llms, **litellm.azure_embedding_models}
if model in azure_llms:
model = azure_llms[model]
if custom_llm_provider is not None and custom_llm_provider == "vertex_ai_beta":
custom_llm_provider = "vertex_ai"
if custom_llm_provider is not None and custom_llm_provider == "vertex_ai":
if "meta/" + model in litellm.vertex_llama3_models:
model = "meta/" + model
elif model + "@latest" in litellm.vertex_mistral_models:
model = model + "@latest"
elif model + "@latest" in litellm.vertex_ai_ai21_models:
model = model + "@latest"
##########################
potential_model_names = _get_potential_model_names(
model=model, custom_llm_provider=custom_llm_provider
)
combined_model_name = potential_model_names["combined_model_name"]
stripped_model_name = potential_model_names["stripped_model_name"]
combined_stripped_model_name = potential_model_names[
"combined_stripped_model_name"
]
split_model = potential_model_names["split_model"]
custom_llm_provider = potential_model_names["custom_llm_provider"]
#########################
if custom_llm_provider == "huggingface":
max_tokens = _get_max_position_embeddings(model_name=model)
return ModelInfoBase(
key=model,
max_tokens=max_tokens, # type: ignore
max_input_tokens=None,
max_output_tokens=None,
input_cost_per_token=0,
output_cost_per_token=0,
litellm_provider="huggingface",
mode="chat",
supports_system_messages=None,
supports_response_schema=None,
supports_function_calling=None,
supports_assistant_prefill=None,
supports_prompt_caching=None,
supports_pdf_input=None,
)
elif custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat":
return litellm.OllamaConfig().get_model_info(model)
else:
"""
Check if: (in order of specificity)
1. 'custom_llm_provider/model' in litellm.model_cost. Checks "groq/llama3-8b-8192" if model="llama3-8b-8192" and custom_llm_provider="groq"
2. 'model' in litellm.model_cost. Checks "gemini-1.5-pro-002" in litellm.model_cost if model="gemini-1.5-pro-002" and custom_llm_provider=None
3. 'combined_stripped_model_name' in litellm.model_cost. Checks if 'gemini/gemini-1.5-flash' in model map, if 'gemini/gemini-1.5-flash-001' given.
4. 'stripped_model_name' in litellm.model_cost. Checks if 'ft:gpt-3.5-turbo' in model map, if 'ft:gpt-3.5-turbo:my-org:custom_suffix:id' given.
5. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192"
"""
_model_info: Optional[Dict[str, Any]] = None
key: Optional[str] = None
if combined_model_name in litellm.model_cost:
key = combined_model_name
_model_info = _get_model_info_from_model_cost(key=key)
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and model in litellm.model_cost:
key = model
_model_info = _get_model_info_from_model_cost(key=key)
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if (
_model_info is None
and combined_stripped_model_name in litellm.model_cost
):
key = combined_stripped_model_name
_model_info = _get_model_info_from_model_cost(key=key)
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and stripped_model_name in litellm.model_cost:
key = stripped_model_name
_model_info = _get_model_info_from_model_cost(key=key)
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and split_model in litellm.model_cost:
key = split_model
_model_info = _get_model_info_from_model_cost(key=key)
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None or key is None:
raise ValueError(
"This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
)
## PROVIDER-SPECIFIC INFORMATION
if custom_llm_provider == "predibase":
_model_info["supports_response_schema"] = True
_input_cost_per_token: Optional[float] = _model_info.get(
"input_cost_per_token"
)
if _input_cost_per_token is None:
# default value to 0, be noisy about this
verbose_logger.debug(
"model={}, custom_llm_provider={} has no input_cost_per_token in model_cost_map. Defaulting to 0.".format(
model, custom_llm_provider
)
)
_input_cost_per_token = 0
_output_cost_per_token: Optional[float] = _model_info.get(
"output_cost_per_token"
)
if _output_cost_per_token is None:
# default value to 0, be noisy about this
verbose_logger.debug(
"model={}, custom_llm_provider={} has no output_cost_per_token in model_cost_map. Defaulting to 0.".format(
model, custom_llm_provider
)
)
_output_cost_per_token = 0
return ModelInfoBase(
key=key,
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_input_cost_per_token,
cache_creation_input_token_cost=_model_info.get(
"cache_creation_input_token_cost", None
),
cache_read_input_token_cost=_model_info.get(
"cache_read_input_token_cost", None
),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
input_cost_per_query=_model_info.get("input_cost_per_query", None),
input_cost_per_second=_model_info.get("input_cost_per_second", None),
input_cost_per_audio_token=_model_info.get(
"input_cost_per_audio_token", None
),
output_cost_per_token=_output_cost_per_token,
output_cost_per_audio_token=_model_info.get(
"output_cost_per_audio_token", None
),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
output_cost_per_second=_model_info.get("output_cost_per_second", None),
output_cost_per_image=_model_info.get("output_cost_per_image", None),
output_vector_size=_model_info.get("output_vector_size", None),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"), # type: ignore
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
supports_response_schema=_model_info.get(
"supports_response_schema", None
),
supports_vision=_model_info.get("supports_vision", False),
supports_function_calling=_model_info.get(
"supports_function_calling", False
),
supports_assistant_prefill=_model_info.get(
"supports_assistant_prefill", False
),
supports_prompt_caching=_model_info.get(
"supports_prompt_caching", False
),
supports_audio_input=_model_info.get("supports_audio_input", False),
supports_audio_output=_model_info.get("supports_audio_output", False),
supports_pdf_input=_model_info.get("supports_pdf_input", False),
tpm=_model_info.get("tpm", None),
rpm=_model_info.get("rpm", None),
)
except Exception as e:
if "OllamaError" in str(e):
raise e
raise Exception(
"This model isn't mapped yet. model={}, custom_llm_provider={}. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json.".format(
model, custom_llm_provider
)
)
def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
"""
Get a dict for the maximum tokens (context window), input_cost_per_token, output_cost_per_token for a given model.
@ -4265,241 +4465,20 @@ def get_model_info( # noqa: PLR0915
"supported_openai_params": ["temperature", "max_tokens", "top_p", "frequency_penalty", "presence_penalty"]
}
"""
supported_openai_params: Union[List[str], None] = []
def _get_max_position_embeddings(model_name):
# Construct the URL for the config.json file
config_url = f"https://huggingface.co/{model_name}/raw/main/config.json"
try:
# Make the HTTP request to get the raw JSON file
response = litellm.module_level_client.get(config_url)
response.raise_for_status() # Raise an exception for bad responses (4xx or 5xx)
# Parse the JSON response
config_json = response.json()
# Extract and return the max_position_embeddings
max_position_embeddings = config_json.get("max_position_embeddings")
if max_position_embeddings is not None:
return max_position_embeddings
else:
return None
except Exception:
return None
try:
azure_llms = {**litellm.azure_llms, **litellm.azure_embedding_models}
if model in azure_llms:
model = azure_llms[model]
if custom_llm_provider is not None and custom_llm_provider == "vertex_ai_beta":
custom_llm_provider = "vertex_ai"
if custom_llm_provider is not None and custom_llm_provider == "vertex_ai":
if "meta/" + model in litellm.vertex_llama3_models:
model = "meta/" + model
elif model + "@latest" in litellm.vertex_mistral_models:
model = model + "@latest"
elif model + "@latest" in litellm.vertex_ai_ai21_models:
model = model + "@latest"
##########################
potential_model_names = _get_potential_model_names(
model=model, custom_llm_provider=custom_llm_provider
)
combined_model_name = potential_model_names["combined_model_name"]
stripped_model_name = potential_model_names["stripped_model_name"]
combined_stripped_model_name = potential_model_names[
"combined_stripped_model_name"
]
split_model = potential_model_names["split_model"]
custom_llm_provider = potential_model_names["custom_llm_provider"]
#########################
supported_openai_params = litellm.get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
if custom_llm_provider == "huggingface":
max_tokens = _get_max_position_embeddings(model_name=model)
return ModelInfo(
key=model,
max_tokens=max_tokens, # type: ignore
max_input_tokens=None,
max_output_tokens=None,
input_cost_per_token=0,
output_cost_per_token=0,
litellm_provider="huggingface",
mode="chat",
supported_openai_params=supported_openai_params,
supports_system_messages=None,
supports_response_schema=None,
supports_function_calling=None,
supports_assistant_prefill=None,
supports_prompt_caching=None,
supports_pdf_input=None,
)
elif custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat":
return litellm.OllamaConfig().get_model_info(model)
else:
"""
Check if: (in order of specificity)
1. 'custom_llm_provider/model' in litellm.model_cost. Checks "groq/llama3-8b-8192" if model="llama3-8b-8192" and custom_llm_provider="groq"
2. 'model' in litellm.model_cost. Checks "gemini-1.5-pro-002" in litellm.model_cost if model="gemini-1.5-pro-002" and custom_llm_provider=None
3. 'combined_stripped_model_name' in litellm.model_cost. Checks if 'gemini/gemini-1.5-flash' in model map, if 'gemini/gemini-1.5-flash-001' given.
4. 'stripped_model_name' in litellm.model_cost. Checks if 'ft:gpt-3.5-turbo' in model map, if 'ft:gpt-3.5-turbo:my-org:custom_suffix:id' given.
5. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192"
"""
_model_info: Optional[Dict[str, Any]] = None
key: Optional[str] = None
if combined_model_name in litellm.model_cost:
key = combined_model_name
_model_info = _get_model_info_from_model_cost(key=key)
_model_info["supported_openai_params"] = supported_openai_params
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and model in litellm.model_cost:
key = model
_model_info = _get_model_info_from_model_cost(key=key)
_model_info["supported_openai_params"] = supported_openai_params
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if (
_model_info is None
and combined_stripped_model_name in litellm.model_cost
):
key = combined_stripped_model_name
_model_info = _get_model_info_from_model_cost(key=key)
_model_info["supported_openai_params"] = supported_openai_params
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and stripped_model_name in litellm.model_cost:
key = stripped_model_name
_model_info = _get_model_info_from_model_cost(key=key)
_model_info["supported_openai_params"] = supported_openai_params
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None and split_model in litellm.model_cost:
key = split_model
_model_info = _get_model_info_from_model_cost(key=key)
_model_info["supported_openai_params"] = supported_openai_params
if not _check_provider_match(
model_info=_model_info, custom_llm_provider=custom_llm_provider
):
_model_info = None
if _model_info is None or key is None:
raise ValueError(
"This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
_model_info = _get_model_info_helper(
model=model,
custom_llm_provider=custom_llm_provider,
)
## PROVIDER-SPECIFIC INFORMATION
if custom_llm_provider == "predibase":
_model_info["supports_response_schema"] = True
returned_model_info = ModelInfo(
**_model_info, supported_openai_params=supported_openai_params
)
_input_cost_per_token: Optional[float] = _model_info.get(
"input_cost_per_token"
)
if _input_cost_per_token is None:
# default value to 0, be noisy about this
verbose_logger.debug(
"model={}, custom_llm_provider={} has no input_cost_per_token in model_cost_map. Defaulting to 0.".format(
model, custom_llm_provider
)
)
_input_cost_per_token = 0
_output_cost_per_token: Optional[float] = _model_info.get(
"output_cost_per_token"
)
if _output_cost_per_token is None:
# default value to 0, be noisy about this
verbose_logger.debug(
"model={}, custom_llm_provider={} has no output_cost_per_token in model_cost_map. Defaulting to 0.".format(
model, custom_llm_provider
)
)
_output_cost_per_token = 0
return ModelInfo(
key=key,
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_input_cost_per_token,
cache_creation_input_token_cost=_model_info.get(
"cache_creation_input_token_cost", None
),
cache_read_input_token_cost=_model_info.get(
"cache_read_input_token_cost", None
),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
input_cost_per_query=_model_info.get("input_cost_per_query", None),
input_cost_per_second=_model_info.get("input_cost_per_second", None),
input_cost_per_audio_token=_model_info.get(
"input_cost_per_audio_token", None
),
output_cost_per_token=_output_cost_per_token,
output_cost_per_audio_token=_model_info.get(
"output_cost_per_audio_token", None
),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
output_cost_per_second=_model_info.get("output_cost_per_second", None),
output_cost_per_image=_model_info.get("output_cost_per_image", None),
output_vector_size=_model_info.get("output_vector_size", None),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"), # type: ignore
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
supports_response_schema=_model_info.get(
"supports_response_schema", None
),
supports_vision=_model_info.get("supports_vision", False),
supports_function_calling=_model_info.get(
"supports_function_calling", False
),
supports_assistant_prefill=_model_info.get(
"supports_assistant_prefill", False
),
supports_prompt_caching=_model_info.get(
"supports_prompt_caching", False
),
supports_audio_input=_model_info.get("supports_audio_input", False),
supports_audio_output=_model_info.get("supports_audio_output", False),
supports_pdf_input=_model_info.get("supports_pdf_input", False),
tpm=_model_info.get("tpm", None),
rpm=_model_info.get("rpm", None),
)
except Exception as e:
if "OllamaError" in str(e):
raise e
raise Exception(
"This model isn't mapped yet. model={}, custom_llm_provider={}. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json.".format(
model, custom_llm_provider
)
)
return returned_model_info
def json_schema_type(python_type_name: str):
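
For readers skimming the diff, the docstring above spells out the key-resolution order `get_model_info` tries against `litellm.model_cost`. A minimal sketch of that order (the `resolve_model_key` helper and its arguments are illustrative, not part of litellm; the real code additionally re-checks `_check_provider_match` on every hit and keeps searching when the provider doesn't match):

```python
from typing import Dict, Optional


def resolve_model_key(
    model_cost: Dict[str, dict],
    combined_model_name: str,           # e.g. "groq/llama3-8b-8192"
    model: str,                         # e.g. "gemini-1.5-pro-002"
    combined_stripped_model_name: str,  # e.g. "gemini/gemini-1.5-flash"
    stripped_model_name: str,           # e.g. "ft:gpt-3.5-turbo"
    split_model: str,                   # e.g. "llama3-8b-8192"
) -> Optional[str]:
    # Try the most specific candidate first, the least specific last.
    for candidate in (
        combined_model_name,
        model,
        combined_stripped_model_name,
        stripped_model_name,
        split_model,
    ):
        if candidate in model_cost:
            return candidate
    return None  # caller raises the "model isn't mapped yet" error
```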

View file

@ -13,7 +13,8 @@
"supports_audio_input": true,
"supports_audio_output": true,
"supports_prompt_caching": true,
"supports_response_schema": true
"supports_response_schema": true,
"supports_system_messages": true
},
"sambanova/Meta-Llama-3.1-8B-Instruct": {
"max_tokens": 16000,
@ -94,7 +95,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o": {
"max_tokens": 16384,
@ -109,7 +111,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-audio-preview": {
"max_tokens": 16384,
@ -124,7 +127,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-audio-preview-2024-10-01": {
"max_tokens": 16384,
@ -139,7 +143,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini-audio-preview-2024-12-17": {
"max_tokens": 16384,
@ -154,7 +159,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini": {
"max_tokens": 16384,
@ -169,7 +175,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-mini-2024-07-18": {
"max_tokens": 16384,
@ -184,7 +191,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"o1": {
"max_tokens": 100000,
@ -198,7 +206,9 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_response_schema": true
},
"o1-mini": {
"max_tokens": 65536,
@ -209,8 +219,6 @@
"cache_read_input_token_cost": 0.0000015,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -223,8 +231,6 @@
"cache_read_input_token_cost": 0.0000015,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -237,8 +243,6 @@
"cache_read_input_token_cost": 0.0000075,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -251,8 +255,6 @@
"cache_read_input_token_cost": 0.0000075,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
},
@ -268,7 +270,9 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": false,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true,
"supports_response_schema": true
},
"chatgpt-4o-latest": {
"max_tokens": 4096,
@ -281,7 +285,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-2024-05-13": {
"max_tokens": 4096,
@ -294,7 +299,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -309,7 +315,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-2024-11-20": {
"max_tokens": 16384,
@ -324,7 +331,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4o-realtime-preview-2024-10-01": {
"max_tokens": 4096,
@ -341,7 +349,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-realtime-preview": {
"max_tokens": 4096,
@ -357,7 +366,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-realtime-preview-2024-12-17": {
"max_tokens": 4096,
@ -373,7 +383,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini-realtime-preview": {
"max_tokens": 4096,
@ -390,7 +401,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4o-mini-realtime-preview-2024-12-17": {
"max_tokens": 4096,
@ -407,7 +419,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_audio_input": true,
"supports_audio_output": true
"supports_audio_output": true,
"supports_system_messages": true
},
"gpt-4-turbo-preview": {
"max_tokens": 4096,
@ -419,7 +432,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-0314": {
"max_tokens": 4096,
@ -429,7 +443,8 @@
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-0613": {
"max_tokens": 4096,
@ -440,7 +455,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-32k": {
"max_tokens": 4096,
@ -450,7 +466,8 @@
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-32k-0314": {
"max_tokens": 4096,
@ -460,7 +477,8 @@
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-32k-0613": {
"max_tokens": 4096,
@ -470,7 +488,8 @@
"output_cost_per_token": 0.00012,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-turbo": {
"max_tokens": 4096,
@ -483,7 +502,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-turbo-2024-04-09": {
"max_tokens": 4096,
@ -496,7 +516,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-1106-preview": {
"max_tokens": 4096,
@ -508,7 +529,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-0125-preview": {
"max_tokens": 4096,
@ -520,7 +542,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-vision-preview": {
"max_tokens": 4096,
@ -531,7 +554,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-4-1106-vision-preview": {
"max_tokens": 4096,
@ -542,7 +566,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
@ -553,7 +578,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4097,
@ -563,7 +589,8 @@
"output_cost_per_token": 0.000002,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-0613": {
"max_tokens": 4097,
@ -574,7 +601,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-1106": {
"max_tokens": 16385,
@ -586,7 +614,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-0125": {
"max_tokens": 16385,
@ -598,7 +627,8 @@
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16385,
@ -608,7 +638,8 @@
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16385,
@ -618,7 +649,8 @@
"output_cost_per_token": 0.000004,
"litellm_provider": "openai",
"mode": "chat",
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"ft:gpt-3.5-turbo": {
"max_tokens": 4096,
@ -627,7 +659,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-3.5-turbo-0125": {
"max_tokens": 4096,
@ -636,7 +669,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-3.5-turbo-1106": {
"max_tokens": 4096,
@ -645,7 +679,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-3.5-turbo-0613": {
"max_tokens": 4096,
@ -654,7 +689,8 @@
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000006,
"litellm_provider": "openai",
"mode": "chat"
"mode": "chat",
"supports_system_messages": true
},
"ft:gpt-4-0613": {
"max_tokens": 4096,
@ -665,7 +701,8 @@
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing",
"supports_system_messages": true
},
"ft:gpt-4o-2024-08-06": {
"max_tokens": 16384,
@ -678,7 +715,8 @@
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true
"supports_vision": true,
"supports_system_messages": true
},
"ft:gpt-4o-2024-11-20": {
"max_tokens": 16384,
@ -693,7 +731,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"ft:gpt-4o-mini-2024-07-18": {
"max_tokens": 16384,
@ -708,7 +747,8 @@
"supports_parallel_function_calling": true,
"supports_response_schema": true,
"supports_vision": true,
"supports_prompt_caching": true
"supports_prompt_caching": true,
"supports_system_messages": true
},
"ft:davinci-002": {
"max_tokens": 16384,
@ -3166,6 +3206,42 @@
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash"
},
"gemini/gemini-2.0-flash-exp": {
"max_tokens": 8192,
"max_input_tokens": 1048576,
"max_output_tokens": 8192,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_image": 0,
"input_cost_per_video_per_second": 0,
"input_cost_per_audio_per_second": 0,
"input_cost_per_token": 0,
"input_cost_per_character": 0,
"input_cost_per_token_above_128k_tokens": 0,
"input_cost_per_character_above_128k_tokens": 0,
"input_cost_per_image_above_128k_tokens": 0,
"input_cost_per_video_per_second_above_128k_tokens": 0,
"input_cost_per_audio_per_second_above_128k_tokens": 0,
"output_cost_per_token": 0,
"output_cost_per_character": 0,
"output_cost_per_token_above_128k_tokens": 0,
"output_cost_per_character_above_128k_tokens": 0,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_response_schema": true,
"supports_audio_output": true,
"tpm": 4000000,
"rpm": 10,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash"
},
"vertex_ai/claude-3-sonnet": {

View file

@ -17,14 +17,19 @@ import litellm
from litellm import Choices, Message, ModelResponse
@pytest.mark.parametrize("model", ["o1-preview", "o1-mini", "o1"])
@pytest.mark.asyncio
async def test_o1_handle_system_role():
async def test_o1_handle_system_role(model):
"""
Tests that:
- max_tokens is translated to 'max_completion_tokens'
- role 'system' is translated to 'user' only when the model doesn't support system messages
"""
from openai import AsyncOpenAI
from litellm.utils import supports_system_messages
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
litellm.set_verbose = True
@ -35,9 +40,9 @@ async def test_o1_handle_system_role():
) as mock_client:
try:
await litellm.acompletion(
model="o1-preview",
model=model,
max_tokens=10,
messages=[{"role": "system", "content": "Hello!"}],
messages=[{"role": "system", "content": "Be a good bot!"}],
client=client,
)
except Exception as e:
@ -48,9 +53,73 @@ async def test_o1_handle_system_role():
print("request_body: ", request_body)
assert request_body["model"] == "o1-preview"
assert request_body["model"] == model
assert request_body["max_completion_tokens"] == 10
assert request_body["messages"] == [{"role": "user", "content": "Hello!"}]
if supports_system_messages(model, "openai"):
assert request_body["messages"] == [
{"role": "system", "content": "Be a good bot!"}
]
else:
assert request_body["messages"] == [
{"role": "user", "content": "Be a good bot!"}
]
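
The assertion above mirrors what the o1 transformation now does: keep the system message when the model map says the model supports it, otherwise downgrade it to a user message. A hedged sketch of that behaviour (`translate_system_message` is a stand-in, not the actual helper in `o1_transformation.py`):

```python
from typing import List

from litellm.utils import supports_system_messages


def translate_system_message(model: str, messages: List[dict]) -> List[dict]:
    # Per this PR's model-map changes: True for "o1", False for o1-mini / o1-preview.
    if supports_system_messages(model, "openai"):
        return messages
    return [
        {**m, "role": "user"} if m.get("role") == "system" else m
        for m in messages
    ]


msgs = [{"role": "system", "content": "Be a good bot!"}]
print(translate_system_message("o1", msgs))       # role stays "system"
print(translate_system_message("o1-mini", msgs))  # role becomes "user"
```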
@pytest.mark.parametrize(
"model, expected_tool_calling_support",
[("o1-preview", False), ("o1-mini", False), ("o1", True)],
)
@pytest.mark.asyncio
async def test_o1_handle_tool_calling_optional_params(
model, expected_tool_calling_support
):
"""
Tests that tool calling ('tools') is only reported as a supported openai param
for models that support it (o1), and not for o1-preview / o1-mini.
"""
from openai import AsyncOpenAI
from litellm.utils import ProviderConfigManager
from litellm.types.utils import LlmProviders
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
config = ProviderConfigManager.get_provider_chat_config(
model=model, provider=LlmProviders.OPENAI
)
supported_params = config.get_supported_openai_params(model=model)
assert expected_tool_calling_support == ("tools" in supported_params)
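
The same check can be reproduced outside pytest; this simply re-uses the calls from the test above, so the only assumption is that the local model map from this PR is loaded:

```python
from litellm.types.utils import LlmProviders
from litellm.utils import ProviderConfigManager

for model in ("o1-preview", "o1-mini", "o1"):
    config = ProviderConfigManager.get_provider_chat_config(
        model=model, provider=LlmProviders.OPENAI
    )
    supported = config.get_supported_openai_params(model=model)
    # Per this PR: False, False, True
    print(model, "tools" in supported)
```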
# @pytest.mark.parametrize(
# "model",
# ["o1"], # "o1-preview", "o1-mini",
# )
# @pytest.mark.asyncio
# async def test_o1_handle_streaming_e2e(model):
# """
# Tests that:
# - max_tokens is translated to 'max_completion_tokens'
# - role 'system' is translated to 'user'
# """
# from openai import AsyncOpenAI
# from litellm.utils import ProviderConfigManager
# from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
# from litellm.types.utils import LlmProviders
# resp = litellm.completion(
# model=model,
# messages=[{"role": "user", "content": "Hello!"}],
# stream=True,
# )
# assert isinstance(resp, CustomStreamWrapper)
# for chunk in resp:
# print("chunk: ", chunk)
# assert True
@pytest.mark.asyncio

View file

@ -2072,6 +2072,7 @@ def test_openai_chat_completion_complete_response_call():
"azure/chatgpt-v-2",
"claude-3-haiku-20240307",
"o1-preview",
"o1",
"azure/fake-o1-mini",
],
)