fix(azure.py, openai.py): correctly raise errors if streaming calls fail
commit c9fdbaf898 (parent 9ba520cc8b)
6 changed files with 110 additions and 24 deletions
@@ -427,14 +427,14 @@ class AzureChatCompletion(BaseLLM):
             },
         )
         response = await azure_client.chat.completions.create(**data)
+        # return response
         streamwrapper = CustomStreamWrapper(
             completion_stream=response,
             model=model,
             custom_llm_provider="azure",
             logging_obj=logging_obj,
         )
-        async for transformed_chunk in streamwrapper:
-            yield transformed_chunk
+        return streamwrapper ## DO NOT make this into an async for ... loop, it will yield an async generator, which won't raise errors if the response fails

     async def aembedding(
         self,
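The azure.py change above is the heart of the fix. Because the old body ended in `yield`, the streaming helper was an async generator function: calling it only created a generator object, so `azure_client.chat.completions.create(**data)` (and any authentication or connection error it raises) did not run until the caller started iterating, where the router's exception handling never saw it. Returning the `CustomStreamWrapper` from a plain coroutine means the request runs while the caller awaits, so failures surface at the call site. A minimal, self-contained sketch of the difference (illustrative only, not litellm code):

import asyncio


async def stream_with_yield():
    # The mere presence of `yield` makes this an async *generator* function:
    # calling it returns a generator object without executing the body, so
    # this error is deferred until the first iteration.
    raise RuntimeError("bad api key")
    yield


async def stream_with_return():
    # Plain coroutine: awaiting it runs the body, so the same error
    # surfaces immediately at the call site.
    raise RuntimeError("bad api key")


async def main():
    try:
        await stream_with_return()
    except RuntimeError as e:
        print(f"raised immediately: {e}")

    agen = stream_with_yield()  # no error raised here
    try:
        async for _ in agen:
            pass
    except RuntimeError as e:
        print(f"raised only once iteration starts: {e}")


asyncio.run(main())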
@@ -482,8 +482,7 @@ class OpenAIChatCompletion(BaseLLM):
                 custom_llm_provider="openai",
                 logging_obj=logging_obj,
             )
-            async for transformed_chunk in streamwrapper:
-                yield transformed_chunk
+            return streamwrapper
         except (
             Exception
         ) as e:  # need to exception handle here. async exceptions don't get caught in sync functions.
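The openai.py hunk is the same pattern applied to the OpenAI client path: the helper hands back the `CustomStreamWrapper` instead of re-yielding its chunks, so a failed call now raises inside the surrounding try/except, which (as its inline comment notes) exists because async exceptions otherwise don't get caught in sync functions.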
@@ -198,18 +198,16 @@ async def acompletion(*args, **kwargs):
         or custom_llm_provider == "ollama"
         or custom_llm_provider == "ollama_chat"
         or custom_llm_provider == "vertex_ai"
-    ): # currently implemented aiohttp calls for just azure and openai, soon all.
-        if kwargs.get("stream", False):
-            response = completion(*args, **kwargs)
+    ): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
+        init_response = await loop.run_in_executor(None, func_with_context)
+        if isinstance(init_response, dict) or isinstance(
+            init_response, ModelResponse
+        ): ## CACHING SCENARIO
+            response = init_response
+        elif asyncio.iscoroutine(init_response):
+            response = await init_response
         else:
-            # Await normally
-            init_response = await loop.run_in_executor(None, func_with_context)
-            if isinstance(init_response, dict) or isinstance(
-                init_response, ModelResponse
-            ): ## CACHING SCENARIO
-                response = init_response
-            elif asyncio.iscoroutine(init_response):
-                response = await init_response
+            response = init_response
     else:
         # Call the synchronous function using run_in_executor
         response = await loop.run_in_executor(None, func_with_context)
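In the acompletion hunk the streaming short-circuit (`response = completion(*args, **kwargs)`) is gone: streaming and non-streaming requests both run `func_with_context` in the default executor, and the result is then inspected. A `dict`/`ModelResponse` means the call was answered synchronously (e.g. a cache hit), while a coroutine is a provider's native async path and still has to be awaited, which is also where its errors propagate. A rough sketch of that dispatch pattern with hypothetical names (`run_and_resolve` and `func` are not litellm identifiers):

import asyncio


async def run_and_resolve(func):
    # Hypothetical helper mirroring the pattern above: run the (possibly
    # synchronous) completion callable in the default thread pool.
    loop = asyncio.get_running_loop()
    init_response = await loop.run_in_executor(None, func)
    if asyncio.iscoroutine(init_response):
        # Native async provider path: awaiting here is what lets its
        # exceptions propagate to the caller.
        return await init_response
    # Cached or synchronous result (dict, ModelResponse, stream wrapper, ...):
    # hand it back unchanged.
    return init_response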
@@ -21,6 +21,13 @@ class MyCustomHandler(CustomLogger):

     def log_pre_api_call(self, model, messages, kwargs):
         print(f"Pre-API Call")
+        print(
+            f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}"
+        )
+        self.previous_models += len(
+            kwargs["litellm_params"]["metadata"]["previous_models"]
+        ) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
+        print(f"self.previous_models: {self.previous_models}")

     def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
         print(
@@ -34,13 +41,6 @@ class MyCustomHandler(CustomLogger):
         print(f"On Stream")

     def log_success_event(self, kwargs, response_obj, start_time, end_time):
-        print(
-            f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}"
-        )
-        self.previous_models += len(
-            kwargs["litellm_params"]["metadata"]["previous_models"]
-        ) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
-        print(f"self.previous_models: {self.previous_models}")
         print(f"On Success")

     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
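Taken together, the two MyCustomHandler hunks move the `previous_models` bookkeeping from `log_success_event` into `log_pre_api_call`. That ordering appears deliberate for the streaming case: once the fix above hands back the stream wrapper as soon as the fallback deployment's call is set up, counting fallbacks at pre-call time no longer depends on when (or whether) the success callback fires.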
@@ -396,3 +396,89 @@ async def test_dynamic_fallbacks_async():


 # asyncio.run(test_dynamic_fallbacks_async())
+
+
+@pytest.mark.asyncio
+async def test_async_fallbacks_streaming():
+    litellm.set_verbose = False
+    model_list = [
+        { # list of model deployments
+            "model_name": "azure/gpt-3.5-turbo", # openai model name
+            "litellm_params": { # params for litellm completion/embedding call
+                "model": "azure/chatgpt-v-2",
+                "api_key": "bad-key",
+                "api_version": os.getenv("AZURE_API_VERSION"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+            },
+            "tpm": 240000,
+            "rpm": 1800,
+        },
+        { # list of model deployments
+            "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name
+            "litellm_params": { # params for litellm completion/embedding call
+                "model": "azure/chatgpt-v-2",
+                "api_key": os.getenv("AZURE_API_KEY"),
+                "api_version": os.getenv("AZURE_API_VERSION"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+            },
+            "tpm": 240000,
+            "rpm": 1800,
+        },
+        {
+            "model_name": "azure/gpt-3.5-turbo", # openai model name
+            "litellm_params": { # params for litellm completion/embedding call
+                "model": "azure/chatgpt-functioncalling",
+                "api_key": "bad-key",
+                "api_version": os.getenv("AZURE_API_VERSION"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+            },
+            "tpm": 240000,
+            "rpm": 1800,
+        },
+        {
+            "model_name": "gpt-3.5-turbo", # openai model name
+            "litellm_params": { # params for litellm completion/embedding call
+                "model": "gpt-3.5-turbo",
+                "api_key": os.getenv("OPENAI_API_KEY"),
+            },
+            "tpm": 1000000,
+            "rpm": 9000,
+        },
+        {
+            "model_name": "gpt-3.5-turbo-16k", # openai model name
+            "litellm_params": { # params for litellm completion/embedding call
+                "model": "gpt-3.5-turbo-16k",
+                "api_key": os.getenv("OPENAI_API_KEY"),
+            },
+            "tpm": 1000000,
+            "rpm": 9000,
+        },
+    ]
+
+    router = Router(
+        model_list=model_list,
+        fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
+        context_window_fallbacks=[
+            {"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]},
+            {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]},
+        ],
+        set_verbose=False,
+    )
+    customHandler = MyCustomHandler()
+    litellm.callbacks = [customHandler]
+    user_message = "Hello, how are you?"
+    messages = [{"content": user_message, "role": "user"}]
+    try:
+        response = await router.acompletion(**kwargs, stream=True)
+        print(f"customHandler.previous_models: {customHandler.previous_models}")
+        await asyncio.sleep(
+            0.05
+        ) # allow a delay as success_callbacks are on a separate thread
+        assert customHandler.previous_models == 1 # 0 retries, 1 fallback
+        router.reset()
+    except litellm.Timeout as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"An exception occurred: {e}")
+    finally:
+        router.reset()
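The new test_async_fallbacks_streaming test is the regression check for the streaming fix: both deployments in the `azure/gpt-3.5-turbo` group are configured with `"api_key": "bad-key"`, so `router.acompletion(..., stream=True)` can only succeed if the bad-key failure is raised early enough for the router to fall back to `gpt-3.5-turbo`. The assertion `customHandler.previous_models == 1` then confirms exactly one fallback (and no retries) happened before the streamed response was handed back.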
@@ -271,7 +271,7 @@ def test_completion_azure_stream():
         pytest.fail(f"Error occurred: {e}")


-# test_completion_azure_stream()
+test_completion_azure_stream()


 def test_completion_azure_function_calling_stream():
@@ -6739,7 +6739,10 @@ class CustomStreamWrapper:
             if str_line.choices[0].finish_reason:
                 is_finished = True
                 finish_reason = str_line.choices[0].finish_reason
-            if str_line.choices[0].logprobs is not None:
+            if (
+                "logprobs" in str_line.choices[0]
+                and str_line.choices[0].logprobs is not None
+            ):
                 logprobs = str_line.choices[0].logprobs
             else:
                 logprobs = None
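The utils.py hunk makes the logprobs read in CustomStreamWrapper defensive: before touching `str_line.choices[0].logprobs`, it first checks that the parsed choice carries a `logprobs` field at all, since not every streamed chunk includes one. With a plain attribute-style object the equivalent guarded read would look like this (a simplified illustration, not litellm's actual chunk types, which the diff probes with a membership test instead):

from types import SimpleNamespace


def extract_logprobs(choice):
    # Fall back to None when the chunk carries no logprobs field,
    # instead of raising on the missing attribute.
    return getattr(choice, "logprobs", None)


assert extract_logprobs(SimpleNamespace(logprobs={"tokens": []})) == {"tokens": []}
assert extract_logprobs(SimpleNamespace()) is None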