Litellm dev 12 30 2024 p1 (#7480)

* test(azure_openai_o1.py): initial commit with testing for azure openai o1 preview model * fix(base_llm_unit_tests.py): handle azure o1 preview response format tests skip as o1 on azure doesn't support tool calling yet * fix: initial commit of azure o1 handler using openai caller simplifies calling + allows fake streaming logic alr. implemented for openai to just work * feat(azure/o1_handler.py): fake o1 streaming for azure o1 models azure does not currently support streaming for o1 * feat(o1_transformation.py): support overriding 'should_fake_stream' on azure/o1 via 'supports_native_streaming' param on model info enables user to toggle on when azure allows o1 streaming without needing to bump versions * style(router.py): remove 'give feedback/get help' messaging when router is used Prevents noisy messaging Closes https://github.com/BerriAI/litellm/issues/5942 * test: fix azure o1 test * test: fix tests * fix: fix test
2025-04-26 11:14:04 +00:00 · 2024-12-30 21:52:52 -08:00 · 2024-12-30 21:52:52 -08:00 · 0178e75cd9
commit 0178e75cd9
parent f0ed02d3ee
17 changed files with 273 additions and 141 deletions
--- a/litellm/llms/azure/chat/o1_handler.py
+++ b/litellm/llms/azure/chat/o1_handler.py
@ -4,96 +4,48 @@ Handler file for calls to Azure OpenAI's o1 family of models
 Written separately to handle faking streaming for o1 models.
 """
-import asyncio
+from typing import Optional, Union
 from typing import Any, Callable, List, Optional, Union
-from httpx._config import Timeout
+import httpx
 from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI
-from litellm.litellm_core_utils.litellm_logging import Logging
+from ...openai.openai import OpenAIChatCompletion
-from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
+from ..common_utils import get_azure_openai_client
 from litellm.types.utils import ModelResponse
 from litellm.utils import CustomStreamWrapper
 from ..azure import AzureChatCompletion
-class AzureOpenAIO1ChatCompletion(AzureChatCompletion):
+class AzureOpenAIO1ChatCompletion(OpenAIChatCompletion):
-
+    def _get_openai_client(
    async def mock_async_streaming(
        self,
-        response: Any,
+        is_async: bool,
-        model: Optional[str],
+        api_key: Optional[str] = None,
-        logging_obj: Any,
+        api_base: Optional[str] = None,
-    ):
+        api_version: Optional[str] = None,
-        model_response = await response
+        timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
-        completion_stream = MockResponseIterator(model_response=model_response)
+        max_retries: Optional[int] = 2,
-        streaming_response = CustomStreamWrapper(
+        organization: Optional[str] = None,
-            completion_stream=completion_stream,
+        client: Optional[
-            model=model,
+            Union[OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI]
-            custom_llm_provider="azure",
+        ] = None,
-            logging_obj=logging_obj,
+    ) -> Optional[
        Union[
            OpenAI,
            AsyncOpenAI,
            AzureOpenAI,
            AsyncAzureOpenAI,
        ]
    ]:
        # Override to use Azure-specific client initialization
        if isinstance(client, OpenAI) or isinstance(client, AsyncOpenAI):
            client = None
        return get_azure_openai_client(
            api_key=api_key,
            api_base=api_base,
            timeout=timeout,
            max_retries=max_retries,
            organization=organization,
            api_version=api_version,
            client=client,
            _is_async=is_async,
        )
        return streaming_response
    def completion(
        self,
        model: str,
        messages: List,
        model_response: ModelResponse,
        api_key: str,
        api_base: str,
        api_version: str,
        api_type: str,
        azure_ad_token: str,
        dynamic_params: bool,
        print_verbose: Callable[..., Any],
        timeout: Union[float, Timeout],
        logging_obj: Logging,
        optional_params,
        litellm_params,
        logger_fn,
        acompletion: bool = False,
        headers: Optional[dict] = None,
        client=None,
    ):
        stream: Optional[bool] = optional_params.pop("stream", False)
        stream_options: Optional[dict] = optional_params.pop("stream_options", None)
        response = super().completion(
            model,
            messages,
            model_response,
            api_key,
            api_base,
            api_version,
            api_type,
            azure_ad_token,
            dynamic_params,
            print_verbose,
            timeout,
            logging_obj,
            optional_params,
            litellm_params,
            logger_fn,
            acompletion,
            headers,
            client,
        )
        if stream is True:
            if asyncio.iscoroutine(response):
                return self.mock_async_streaming(
                    response=response, model=model, logging_obj=logging_obj  # type: ignore
                )
            completion_stream = MockResponseIterator(model_response=response)
            streaming_response = CustomStreamWrapper(
                completion_stream=completion_stream,
                model=model,
                custom_llm_provider="openai",
                logging_obj=logging_obj,
                stream_options=stream_options,
            )
            return streaming_response
        else:
            return response
--- a/litellm/llms/azure/chat/o1_transformation.py
+++ b/litellm/llms/azure/chat/o1_transformation.py
@ -12,10 +12,41 @@ Translations handled by LiteLLM:
 - Temperature => drop param (if user opts in to dropping param)
 """
 from typing import Optional
 from litellm import verbose_logger
 from litellm.utils import get_model_info
 from ...openai.chat.o1_transformation import OpenAIO1Config
 class AzureOpenAIO1Config(OpenAIO1Config):
    def should_fake_stream(
        self,
        model: Optional[str],
        stream: Optional[bool],
        custom_llm_provider: Optional[str] = None,
    ) -> bool:
        """
        Currently no Azure OpenAI models support native streaming.
        """
        if stream is not True:
            return False
        if model is not None:
            try:
                model_info = get_model_info(
                    model=model, custom_llm_provider=custom_llm_provider
                )
                if model_info.get("supports_native_streaming") is True:
                    return False
            except Exception as e:
                verbose_logger.debug(
                    f"Error getting model info in AzureOpenAIO1Config: {e}"
                )
        return True
    def is_o1_model(self, model: str) -> bool:
        o1_models = ["o1-mini", "o1-preview"]
        for m in o1_models:
--- a/litellm/llms/azure/common_utils.py
+++ b/litellm/llms/azure/common_utils.py
@ -1,7 +1,9 @@
 from typing import Callable, Optional, Union
 import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
 import litellm
 from litellm._logging import verbose_logger
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
 from litellm.secret_managers.main import get_secret_str
@ -25,6 +27,39 @@ class AzureOpenAIError(BaseLLMException):
        )
 def get_azure_openai_client(
    api_key: Optional[str],
    api_base: Optional[str],
    timeout: Union[float, httpx.Timeout],
    max_retries: Optional[int],
    api_version: Optional[str] = None,
    organization: Optional[str] = None,
    client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
    _is_async: bool = False,
 ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
    received_args = locals()
    openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
    if client is None:
        data = {}
        for k, v in received_args.items():
            if k == "self" or k == "client" or k == "_is_async":
                pass
            elif k == "api_base" and v is not None:
                data["azure_endpoint"] = v
            elif v is not None:
                data[k] = v
        if "api_version" not in data:
            data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
        if _is_async is True:
            openai_client = AsyncAzureOpenAI(**data)
        else:
            openai_client = AzureOpenAI(**data)  # type: ignore
    else:
        openai_client = client
    return openai_client
 def process_azure_headers(headers: Union[httpx.Headers, dict]) -> dict:
    openai_headers = {}
    if "x-ratelimit-limit-requests" in headers:
--- a/litellm/llms/azure/files/handler.py
+++ b/litellm/llms/azure/files/handler.py
@ -4,43 +4,11 @@ import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
 from openai.types.file_deleted import FileDeleted
 import litellm
 from litellm._logging import verbose_logger
 from litellm.llms.base import BaseLLM
 from litellm.types.llms.openai import *
-
+from ..common_utils import get_azure_openai_client
 def get_azure_openai_client(
    api_key: Optional[str],
    api_base: Optional[str],
    timeout: Union[float, httpx.Timeout],
    max_retries: Optional[int],
    api_version: Optional[str] = None,
    organization: Optional[str] = None,
    client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
    _is_async: bool = False,
 ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
    received_args = locals()
    openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
    if client is None:
        data = {}
        for k, v in received_args.items():
            if k == "self" or k == "client" or k == "_is_async":
                pass
            elif k == "api_base" and v is not None:
                data["azure_endpoint"] = v
            elif v is not None:
                data[k] = v
        if "api_version" not in data:
            data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
        if _is_async is True:
            openai_client = AsyncAzureOpenAI(**data)
        else:
            openai_client = AzureOpenAI(**data)  # type: ignore
    else:
        openai_client = client
    return openai_client
 class AzureOpenAIFilesAPI(BaseLLM):
--- a/litellm/llms/openai/openai.py
+++ b/litellm/llms/openai/openai.py
@ -275,6 +275,7 @@ class OpenAIChatCompletion(BaseLLM):
        is_async: bool,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
        max_retries: Optional[int] = 2,
        organization: Optional[str] = None,
@ -423,6 +424,9 @@ class OpenAIChatCompletion(BaseLLM):
        print_verbose: Optional[Callable] = None,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        dynamic_params: Optional[bool] = None,
        azure_ad_token: Optional[str] = None,
        acompletion: bool = False,
        logger_fn=None,
        headers: Optional[dict] = None,
@ -432,6 +436,7 @@ class OpenAIChatCompletion(BaseLLM):
        custom_llm_provider: Optional[str] = None,
        drop_params: Optional[bool] = None,
    ):
        super().completion()
        try:
            fake_stream: bool = False
@ -441,6 +446,7 @@ class OpenAIChatCompletion(BaseLLM):
            )
            stream: Optional[bool] = inference_params.pop("stream", False)
            provider_config: Optional[BaseConfig] = None
            if custom_llm_provider is not None and model is not None:
                provider_config = ProviderConfigManager.get_provider_chat_config(
                    model=model, provider=LlmProviders(custom_llm_provider)
@ -450,6 +456,7 @@ class OpenAIChatCompletion(BaseLLM):
                fake_stream = provider_config.should_fake_stream(
                    model=model, custom_llm_provider=custom_llm_provider, stream=stream
                )
            if headers:
                inference_params["extra_headers"] = headers
            if model is None or messages is None:
@ -469,7 +476,7 @@ class OpenAIChatCompletion(BaseLLM):
            if messages is not None and provider_config is not None:
                if isinstance(provider_config, OpenAIGPTConfig) or isinstance(
                    provider_config, OpenAIConfig
-                ):
+                ):  # [TODO]: remove. no longer needed as .transform_request can just handle this.
                    messages = provider_config._transform_messages(
                        messages=messages, model=model
                    )
@ -504,6 +511,7 @@ class OpenAIChatCompletion(BaseLLM):
                                model=model,
                                api_base=api_base,
                                api_key=api_key,
                                api_version=api_version,
                                timeout=timeout,
                                client=client,
                                max_retries=max_retries,
@ -520,6 +528,7 @@ class OpenAIChatCompletion(BaseLLM):
                                model_response=model_response,
                                api_base=api_base,
                                api_key=api_key,
                                api_version=api_version,
                                timeout=timeout,
                                client=client,
                                max_retries=max_retries,
@ -535,6 +544,7 @@ class OpenAIChatCompletion(BaseLLM):
                            model=model,
                            api_base=api_base,
                            api_key=api_key,
                            api_version=api_version,
                            timeout=timeout,
                            client=client,
                            max_retries=max_retries,
@ -546,11 +556,11 @@ class OpenAIChatCompletion(BaseLLM):
                            raise OpenAIError(
                                status_code=422, message="max retries must be an int"
                            )
                        openai_client: OpenAI = self._get_openai_client(  # type: ignore
                            is_async=False,
                            api_key=api_key,
                            api_base=api_base,
                            api_version=api_version,
                            timeout=timeout,
                            max_retries=max_retries,
                            organization=organization,
@ -667,6 +677,7 @@ class OpenAIChatCompletion(BaseLLM):
        timeout: Union[float, httpx.Timeout],
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        organization: Optional[str] = None,
        client=None,
        max_retries=None,
@ -684,6 +695,7 @@ class OpenAIChatCompletion(BaseLLM):
                    is_async=True,
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    timeout=timeout,
                    max_retries=max_retries,
                    organization=organization,
@ -758,6 +770,7 @@ class OpenAIChatCompletion(BaseLLM):
        model: str,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        organization: Optional[str] = None,
        client=None,
        max_retries=None,
@ -767,10 +780,12 @@ class OpenAIChatCompletion(BaseLLM):
        data["stream"] = True
        if stream_options is not None:
            data["stream_options"] = stream_options
        openai_client: OpenAI = self._get_openai_client(  # type: ignore
            is_async=False,
            api_key=api_key,
            api_base=api_base,
            api_version=api_version,
            timeout=timeout,
            max_retries=max_retries,
            organization=organization,
@ -812,6 +827,7 @@ class OpenAIChatCompletion(BaseLLM):
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        organization: Optional[str] = None,
        client=None,
        max_retries=None,
@ -829,6 +845,7 @@ class OpenAIChatCompletion(BaseLLM):
                    is_async=True,
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    timeout=timeout,
                    max_retries=max_retries,
                    organization=organization,
--- a/litellm/main.py
+++ b/litellm/main.py
@ -1225,10 +1225,7 @@ def completion(  # type: ignore # noqa: PLR0915
            if extra_headers is not None:
                optional_params["extra_headers"] = extra_headers
-            if (
+            if litellm.AzureOpenAIO1Config().is_o1_model(model=model):
                litellm.enable_preview_features
                and litellm.AzureOpenAIO1Config().is_o1_model(model=model)
            ):
                ## LOAD CONFIG - if set
                config = litellm.AzureOpenAIO1Config.get_config()
                for k, v in config.items():
@ -1244,7 +1241,6 @@ def completion(  # type: ignore # noqa: PLR0915
                    api_key=api_key,
                    api_base=api_base,
                    api_version=api_version,
                    api_type=api_type,
                    dynamic_params=dynamic_params,
                    azure_ad_token=azure_ad_token,
                    model_response=model_response,
@ -1256,6 +1252,7 @@ def completion(  # type: ignore # noqa: PLR0915
                    acompletion=acompletion,
                    timeout=timeout,  # type: ignore
                    client=client,  # pass AsyncAzureOpenAI, AzureOpenAI client
                    custom_llm_provider=custom_llm_provider,
                )
            else:
                ## LOAD CONFIG - if set
--- a/litellm/proxy/_experimental/out/404.html
+++ b/litellm/proxy/_experimental/out/404.html
--- a/litellm/proxy/_experimental/out/model_hub.html
+++ b/litellm/proxy/_experimental/out/model_hub.html
--- a/litellm/proxy/_experimental/out/onboarding.html
+++ b/litellm/proxy/_experimental/out/onboarding.html
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@ -10,4 +10,12 @@ model_list:
      model: openai/o1-*
      api_key: os.environ/OPENAI_API_KEY
    model_info:
-      access_groups: ["restricted-models"]
+      access_groups: ["restricted-models"]
  - model_name: azure-o1-preview
    litellm_params:
      model: azure/o1-preview
      api_key: os.environ/AZURE_OPENAI_O1_KEY
      api_base: os.environ/AZURE_API_BASE
    model_info:
      supports_native_streaming: True
      access_groups: ["shared-models"]
--- a/litellm/router.py
+++ b/litellm/router.py
@ -296,6 +296,7 @@ class Router:
        self.debug_level = debug_level
        self.enable_pre_call_checks = enable_pre_call_checks
        self.enable_tag_filtering = enable_tag_filtering
        litellm.suppress_debug_info = True  # prevents 'Give Feedback/Get help' message from being emitted on Router - Relevant Issue: https://github.com/BerriAI/litellm/issues/5942
        if self.set_verbose is True:
            if debug_level == "INFO":
                verbose_router_logger.setLevel(logging.INFO)
@ -3812,6 +3813,7 @@ class Router:
            _model_name = (
                deployment.litellm_params.custom_llm_provider + "/" + _model_name
            )
        litellm.register_model(
            model_cost={
                _model_name: _model_info,
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@ -86,6 +86,7 @@ class ProviderSpecificModelInfo(TypedDict, total=False):
    supports_embedding_image_input: Optional[bool]
    supports_audio_output: Optional[bool]
    supports_pdf_input: Optional[bool]
    supports_native_streaming: Optional[bool]
 class ModelInfoBase(ProviderSpecificModelInfo, total=False):
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -1893,7 +1893,6 @@ def register_model(model_cost: Union[str, dict]):  # noqa: PLR0915
        },
    }
    """
    loaded_model_cost = {}
    if isinstance(model_cost, dict):
        loaded_model_cost = model_cost
@ -4353,6 +4352,9 @@ def _get_model_info_helper(  # noqa: PLR0915
                supports_embedding_image_input=_model_info.get(
                    "supports_embedding_image_input", False
                ),
                supports_native_streaming=_model_info.get(
                    "supports_native_streaming", None
                ),
                tpm=_model_info.get("tpm", None),
                rpm=_model_info.get("rpm", None),
            )
@ -6050,7 +6052,10 @@ class ProviderConfigManager:
        """
        Returns the provider config for a given provider.
        """
-        if litellm.openAIO1Config.is_model_o1_reasoning_model(model=model):
+        if (
            provider == LlmProviders.OPENAI
            and litellm.openAIO1Config.is_model_o1_reasoning_model(model=model)
        ):
            return litellm.OpenAIO1Config()
        elif litellm.LlmProviders.DEEPSEEK == provider:
            return litellm.DeepSeekChatConfig()
@ -6122,6 +6127,8 @@ class ProviderConfigManager:
        ):
            return litellm.AI21ChatConfig()
        elif litellm.LlmProviders.AZURE == provider:
            if litellm.AzureOpenAIO1Config().is_o1_model(model=model):
                return litellm.AzureOpenAIO1Config()
            return litellm.AzureOpenAIConfig()
        elif litellm.LlmProviders.AZURE_AI == provider:
            return litellm.AzureAIStudioConfig()
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@ -91,6 +91,40 @@ class BaseLLMChatTest(ABC):
        # for OpenAI the content contains the JSON schema, so we need to assert that the content is not None
        assert response.choices[0].message.content is not None
    def test_streaming(self):
        """Check if litellm handles streaming correctly"""
        base_completion_call_args = self.get_base_completion_call_args()
        litellm.set_verbose = True
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": "Hello, how are you?"}],
            }
        ]
        try:
            response = self.completion_function(
                **base_completion_call_args,
                messages=messages,
                stream=True,
            )
            assert response is not None
            assert isinstance(response, CustomStreamWrapper)
        except litellm.InternalServerError:
            pytest.skip("Model is overloaded")
        # for OpenAI the content contains the JSON schema, so we need to assert that the content is not None
        chunks = []
        for chunk in response:
            print(chunk)
            chunks.append(chunk)
        resp = litellm.stream_chunk_builder(chunks=chunks)
        print(resp)
        # assert resp.usage.prompt_tokens > 0
        # assert resp.usage.completion_tokens > 0
        # assert resp.usage.total_tokens > 0
    def test_pydantic_model_input(self):
        litellm.set_verbose = True
@ -154,9 +188,14 @@ class BaseLLMChatTest(ABC):
        """
        Test that the JSON response format is supported by the LLM API
        """
        from litellm.utils import supports_response_schema
        base_completion_call_args = self.get_base_completion_call_args()
        litellm.set_verbose = True
        if not supports_response_schema(base_completion_call_args["model"], None):
            pytest.skip("Model does not support response schema")
        messages = [
            {
                "role": "system",
@ -225,9 +264,15 @@ class BaseLLMChatTest(ABC):
        """
        Test that the JSON response format with streaming is supported by the LLM API
        """
        from litellm.utils import supports_response_schema
        base_completion_call_args = self.get_base_completion_call_args()
        litellm.set_verbose = True
        base_completion_call_args = self.get_base_completion_call_args()
        if not supports_response_schema(base_completion_call_args["model"], None):
            pytest.skip("Model does not support response schema")
        messages = [
            {
                "role": "system",
--- a/tests/llm_translation/test_azure_o1.py
+++ b/tests/llm_translation/test_azure_o1.py
@ -0,0 +1,65 @@
 import json
 import os
 import sys
 from datetime import datetime
 from unittest.mock import AsyncMock, patch, MagicMock
 sys.path.insert(
    0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 import httpx
 import pytest
 from respx import MockRouter
 import litellm
 from litellm import Choices, Message, ModelResponse
 from base_llm_unit_tests import BaseLLMChatTest
 class TestAzureOpenAIO1(BaseLLMChatTest):
    def get_base_completion_call_args(self):
        return {
            "model": "azure/o1-preview",
            "api_key": os.getenv("AZURE_OPENAI_O1_KEY"),
            "api_base": "https://openai-gpt-4-test-v-1.openai.azure.com",
        }
    def test_tool_call_no_arguments(self, tool_call_no_arguments):
        """Test that tool calls with no arguments is translated correctly. Relevant issue: https://github.com/BerriAI/litellm/issues/6833"""
        pass
    def test_prompt_caching(self):
        """Temporary override. o1 prompt caching is not working."""
        pass
    def test_override_fake_stream(self):
        """Test that native streaming is not supported for o1."""
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "azure/o1-preview",
                    "litellm_params": {
                        "model": "azure/o1-preview",
                        "api_key": "my-fake-o1-key",
                        "api_base": "https://openai-gpt-4-test-v-1.openai.azure.com",
                    },
                    "model_info": {
                        "supports_native_streaming": True,
                    },
                }
            ]
        )
        ## check model info
        model_info = litellm.get_model_info(
            model="azure/o1-preview", custom_llm_provider="azure"
        )
        assert model_info["supports_native_streaming"] is True
        fake_stream = litellm.AzureOpenAIO1Config().should_fake_stream(
            model="azure/o1-preview", stream=True
        )
        assert fake_stream is False
--- a/tests/local_testing/test_alangfuse.py
+++ b/tests/local_testing/test_alangfuse.py
@ -307,6 +307,9 @@ async def test_langfuse_logging_audio_transcriptions(langfuse_client):
@pytest.mark.asyncio
@pytest.mark.skip(
    reason="langfuse now takes 5-10 mins to get this trace. Need to figure out how to test this"
 )
 async def test_langfuse_masked_input_output(langfuse_client):
    """
    Test that creates a trace with masked input and output
--- a/tests/local_testing/test_get_model_info.py
+++ b/tests/local_testing/test_get_model_info.py
@ -219,6 +219,7 @@ def test_model_info_bedrock_converse(monkeypatch):
    )
@pytest.mark.flaky(retries=6, delay=2)
 def test_model_info_bedrock_converse_enforcement(monkeypatch):
    """
    Test the enforcement of the whitelist by adding a fake model and ensuring the test fails.
@ -232,12 +233,15 @@ def test_model_info_bedrock_converse_enforcement(monkeypatch):
        "mode": "chat",
    }
-    # Load whitelist models from file
+    try:
-    with open("whitelisted_bedrock_models.txt", "r") as file:
+        # Load whitelist models from file
-        whitelist_models = [line.strip() for line in file.readlines()]
+        with open("whitelisted_bedrock_models.txt", "r") as file:
            whitelist_models = [line.strip() for line in file.readlines()]
-    # Check for unwhitelisted models
+        # Check for unwhitelisted models
-    with pytest.raises(AssertionError):
+        with pytest.raises(AssertionError):
-        _enforce_bedrock_converse_models(
+            _enforce_bedrock_converse_models(
-            model_cost=litellm.model_cost, whitelist_models=whitelist_models
+                model_cost=litellm.model_cost, whitelist_models=whitelist_models
-        )
+            )
    except FileNotFoundError as e:
        pytest.skip("whitelisted_bedrock_models.txt not found")