forked from phoenix/litellm-mirror

refactor(openai.py): making it compatible for openai v1

BREAKING CHANGE:

parent 833c38edeb
commit d3323ba637

12 changed files with 622 additions and 370 deletions
@@ -375,6 +375,7 @@ from .integrations import *
 from .exceptions import (
     AuthenticationError,
     InvalidRequestError,
+    BadRequestError,
     RateLimitError,
     ServiceUnavailableError,
     OpenAIError,
@@ -8,75 +8,82 @@
 # Thank you users! We ❤️ you! - Krrish & Ishaan

 ## LiteLLM versions of the OpenAI Exception Types
-from openai.error import (
+from openai import (
     AuthenticationError,
-    InvalidRequestError,
+    BadRequestError,
     RateLimitError,
-    ServiceUnavailableError,
+    APIStatusError,
     OpenAIError,
     APIError,
-    Timeout,
+    APITimeoutError,
     APIConnectionError,
 )
+import httpx


 class AuthenticationError(AuthenticationError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, response: httpx.Response):
         self.status_code = 401
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs


-class InvalidRequestError(InvalidRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
+class BadRequestError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider, response: httpx.Response):
         self.status_code = 400
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
         super().__init__(
-            self.message, f"{self.model}"
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs


-class Timeout(Timeout):  # type: ignore
-    def __init__(self, message, model, llm_provider):
+class Timeout(APITimeoutError):  # type: ignore
+    def __init__(self, message, model, llm_provider, request: httpx.Request):
         self.status_code = 408
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
         super().__init__(
-            self.message, f"{self.model}"
+            request=request
         )  # Call the base class constructor with the parameters it needs


-# sub class of invalid request error - meant to give more granularity for error handling context window exceeded errors
-class ContextWindowExceededError(InvalidRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
-        self.status_code = 400
-        self.message = message
-        self.model = model
-        self.llm_provider = llm_provider
-        super().__init__(
-            self.message, self.model, self.llm_provider
-        )  # Call the base class constructor with the parameters it needs


 class RateLimitError(RateLimitError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, response: httpx.Response):
         self.status_code = 429
         self.message = message
         self.llm_provider = llm_provider
         self.modle = model
         super().__init__(
-            self.message
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs


+# sub class of rate limit error - meant to give more granularity for error handling context window exceeded errors
+class ContextWindowExceededError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider, response: httpx.Response):
+        self.status_code = 400
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        super().__init__(
+            message=self.message,
+            model=self.model,
+            llm_provider=self.llm_provider,
+            response=response
+        )  # Call the base class constructor with the parameters it needs


-class ServiceUnavailableError(ServiceUnavailableError):  # type: ignore
+class ServiceUnavailableError(APIStatusError):  # type: ignore
     def __init__(self, message, llm_provider, model):
-        self.status_code = 500
+        self.status_code = 503
         self.message = message
         self.llm_provider = llm_provider
         self.model = model

@@ -87,13 +94,14 @@ class ServiceUnavailableError(ServiceUnavailableError):  # type: ignore

 # raise this when the API returns an invalid response object - https://github.com/openai/openai-python/blob/1be14ee34a0f8e42d3f9aa5451aa4cb161f1781f/openai/api_requestor.py#L401
 class APIError(APIError):  # type: ignore
-    def __init__(self, status_code, message, llm_provider, model):
+    def __init__(self, status_code, message, llm_provider, model, request: httpx.Request):
         self.status_code = status_code
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            self.message,
+            request=request
         )

 # raised if an invalid request (not get, delete, put, post) is made

@@ -124,3 +132,14 @@ class BudgetExceededError(Exception):
         self.max_budget = max_budget
         message = f"Budget has been exceeded! Current cost: {current_cost}, Max budget: {max_budget}"
         super().__init__(message)
+
+## DEPRECATED ##
+class InvalidRequestError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider):
+        self.status_code = 400
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        super().__init__(
+            self.message, f"{self.model}"
+        )  # Call the base class constructor with the parameters it needs
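Note on the new exception signatures: the v1-style wrappers now forward the httpx request/response that triggered the error to the openai base class. The following is a minimal sketch (not part of the commit) of how a caller would construct and catch one of these wrappers, assuming openai>=1.x and a litellm build containing this change; the dummy httpx objects are purely illustrative.

import httpx
from litellm.exceptions import AuthenticationError

# illustrative httpx objects standing in for a real failed call
request = httpx.Request(method="POST", url="https://api.openai.com/v1/chat/completions")
response = httpx.Response(status_code=401, request=request)

try:
    # the wrapper now passes the httpx.Response through to the openai v1 base class
    raise AuthenticationError(
        message="invalid api key",
        llm_provider="openai",
        model="gpt-3.5-turbo",
        response=response,
    )
except AuthenticationError as e:
    print(e.status_code, e.llm_provider, e.model)  # 401 openai gpt-3.5-turbo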
@@ -1,15 +1,20 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
 import litellm
-import requests, certifi, ssl
+import httpx, certifi, ssl


 class BaseLLM:
+    _client_session = None
     def create_client_session(self):
         if litellm.client_session:
-            session = litellm.client_session
+            _client_session = litellm.client_session
         else:
-            session = requests.Session()
+            _client_session = httpx.Client(timeout=600)

-        return session
+        return _client_session

+    def __exit__(self):
+        if hasattr(self, '_client_session'):
+            self._client_session.close()

     def validate_environment(self):  # set up the environment required to run the model
         pass
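For context, a small standalone sketch of the session handling above: httpx.Client replaces requests.Session, the 600-second timeout comes from the diff, and the explicit close mirrors the new __exit__ hook. The example URL and request are placeholders, not values from the commit.

import httpx
import litellm

def create_client_session() -> httpx.Client:
    # reuse a user-configured session if present, otherwise build one with the 600s timeout
    if litellm.client_session:
        return litellm.client_session
    return httpx.Client(timeout=600)

client = create_client_session()
try:
    resp = client.post("https://example.com/v1/chat/completions", json={})  # placeholder call
    print(resp.status_code)
finally:
    if client is not litellm.client_session:
        client.close()  # same cleanup the new __exit__ performs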
@@ -1,14 +1,17 @@
 from typing import Optional, Union
-import types, requests
+import types
+import httpx
 from .base import BaseLLM
 from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, convert_to_model_response_object
 from typing import Callable, Optional
 import aiohttp


 class OpenAIError(Exception):
-    def __init__(self, status_code, message):
+    def __init__(self, status_code, message, request: httpx.Request, response: httpx.Response):
         self.status_code = status_code
         self.message = message
+        self.request = request
+        self.response = response
         super().__init__(
             self.message
         )  # Call the base class constructor with the parameters it needs

@@ -144,7 +147,7 @@ class OpenAITextCompletionConfig():
             and v is not None}


 class OpenAIChatCompletion(BaseLLM):
-    _client_session: requests.Session
+    _client_session: httpx.Client

     def __init__(self) -> None:
         super().__init__()

@@ -200,18 +203,8 @@ class OpenAIChatCompletion(BaseLLM):
                     return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
                 else:
                     return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response)
-            elif "stream" in optional_params and optional_params["stream"] == True:
-                response = self._client_session.post(
-                    url=api_base,
-                    json=data,
-                    headers=headers,
-                    stream=optional_params["stream"]
-                )
-                if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
-
-                ## RESPONSE OBJECT
-                return response.iter_lines()
+            elif optional_params.get("stream", False):
+                return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
             else:
                 response = self._client_session.post(
                     url=api_base,

@@ -219,7 +212,7 @@ class OpenAIChatCompletion(BaseLLM):
                     headers=headers,
                 )
                 if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
+                    raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)

                 ## RESPONSE OBJECT
                 return convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)

@@ -246,41 +239,64 @@ class OpenAIChatCompletion(BaseLLM):
             exception_mapping_worked = True
             raise e
         except Exception as e:
-            if exception_mapping_worked:
-                raise e
-            else:
-                import traceback
-                raise OpenAIError(status_code=500, message=traceback.format_exc())
+            raise e

     async def acompletion(self,
                           api_base: str,
                           data: dict, headers: dict,
                           model_response: ModelResponse):
-        async with aiohttp.ClientSession() as session:
-            async with session.post(api_base, json=data, headers=headers, ssl=False) as response:
-                response_json = await response.json()
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_base, json=data, headers=headers)
+            response_json = response.json()
             if response.status != 200:
                 raise OpenAIError(status_code=response.status, message=response.text)

             ## RESPONSE OBJECT
             return convert_to_model_response_object(response_object=response_json, model_response_object=model_response)

+    def streaming(self,
+                  logging_obj,
+                  api_base: str,
+                  data: dict,
+                  headers: dict,
+                  model_response: ModelResponse,
+                  model: str
+    ):
+        with self._client_session.stream(
+            url=f"{api_base}",
+            json=data,
+            headers=headers,
+            method="POST"
+        ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text(), request=self._client_session.request, response=response)
+
+            completion_stream = response.iter_lines()
+            streamwrapper = CustomStreamWrapper(completion_stream=completion_stream, model=model, custom_llm_provider="openai", logging_obj=logging_obj)
+            for transformed_chunk in streamwrapper:
+                yield transformed_chunk
+
     async def async_streaming(self,
                               logging_obj,
                               api_base: str,
-                              data: dict, headers: dict,
+                              data: dict,
+                              headers: dict,
                               model_response: ModelResponse,
                               model: str):
-        async with aiohttp.ClientSession() as session:
-            async with session.post(api_base, json=data, headers=headers, ssl=False) as response:
-                # Check if the request was successful (status code 200)
-                if response.status != 200:
-                    raise OpenAIError(status_code=response.status, message=await response.text())
-
-            streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="openai", logging_obj=logging_obj)
+        client = httpx.AsyncClient()
+        async with client.stream(
+            url=f"{api_base}",
+            json=data,
+            headers=headers,
+            method="POST"
+        ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text(), request=self._client_session.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="openai", logging_obj=logging_obj)
             async for transformed_chunk in streamwrapper:
                 yield transformed_chunk

     def embedding(self,
                   model: str,

@@ -349,7 +365,7 @@ class OpenAIChatCompletion(BaseLLM):


 class OpenAITextCompletion(BaseLLM):
-    _client_session: requests.Session
+    _client_session: httpx.Client

     def __init__(self) -> None:
         super().__init__()

@@ -367,7 +383,7 @@ class OpenAITextCompletion(BaseLLM):
         try:
             ## RESPONSE OBJECT
             if response_object is None or model_response_object is None:
-                raise OpenAIError(status_code=500, message="Error in response object format")
+                raise ValueError(message="Error in response object format")
             choice_list=[]
             for idx, choice in enumerate(response_object["choices"]):
                 message = Message(content=choice["text"], role="assistant")

@@ -386,8 +402,8 @@ class OpenAITextCompletion(BaseLLM):

             model_response_object._hidden_params["original_response"] = response_object  # track original response, if users make a litellm.text_completion() request, we can return the original response
             return model_response_object
-        except:
-            OpenAIError(status_code=500, message="Invalid response object.")
+        except Exception as e:
+            raise e

     def completion(self,
                    model: Optional[str]=None,

@@ -397,6 +413,7 @@ class OpenAITextCompletion(BaseLLM):
                    api_key: Optional[str]=None,
                    api_base: Optional[str]=None,
                    logging_obj=None,
+                   acompletion: bool = False,
                    optional_params=None,
                    litellm_params=None,
                    logger_fn=None,

@@ -412,9 +429,6 @@ class OpenAITextCompletion(BaseLLM):
             api_base = f"{api_base}/completions"

             if len(messages)>0 and "content" in messages[0] and type(messages[0]["content"]) == list:
-                # Note: internal logic - for enabling litellm.text_completion()
-                # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
-                # https://platform.openai.com/docs/api-reference/completions/create
                 prompt = messages[0]["content"]
             else:
                 prompt = " ".join([message["content"] for message in messages])  # type: ignore

@@ -431,19 +445,13 @@ class OpenAITextCompletion(BaseLLM):
                 api_key=api_key,
                 additional_args={"headers": headers, "api_base": api_base, "data": data},
             )
-            if "stream" in optional_params and optional_params["stream"] == True:
-                response = self._client_session.post(
-                    url=f"{api_base}",
-                    json=data,
-                    headers=headers,
-                    stream=optional_params["stream"]
-                )
-                if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
-
-                ## RESPONSE OBJECT
-                return response.iter_lines()
+            if acompletion == True:
+                if optional_params.get("stream", False):
+                    return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
+                else:
+                    return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response, prompt=prompt, api_key=api_key, logging_obj=logging_obj, model=model)
+            elif optional_params.get("stream", False):
+                return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
             else:
                 response = self._client_session.post(
                     url=f"{api_base}",

@@ -451,7 +459,7 @@ class OpenAITextCompletion(BaseLLM):
                     headers=headers,
                 )
                 if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
+                    raise OpenAIError(status_code=response.status_code, message=response.text, request=self._client_session.request, response=response)

                 ## LOGGING
                 logging_obj.post_call(

@@ -466,12 +474,76 @@ class OpenAITextCompletion(BaseLLM):

                 ## RESPONSE OBJECT
                 return self.convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)
-        except OpenAIError as e:
-            exception_mapping_worked = True
-            raise e
         except Exception as e:
-            if exception_mapping_worked:
-                raise e
-            else:
-                import traceback
-                raise OpenAIError(status_code=500, message=traceback.format_exc())
+            raise e
+
+    async def acompletion(self,
+                          logging_obj,
+                          api_base: str,
+                          data: dict,
+                          headers: dict,
+                          model_response: ModelResponse,
+                          prompt: str,
+                          api_key: str,
+                          model: str):
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_base, json=data, headers=headers)
+            response_json = response.json()
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text)
+
+            ## LOGGING
+            logging_obj.post_call(
+                input=prompt,
+                api_key=api_key,
+                original_response=response,
+                additional_args={
+                    "headers": headers,
+                    "api_base": api_base,
+                },
+            )
+
+            ## RESPONSE OBJECT
+            return self.convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+
+    def streaming(self,
+                  logging_obj,
+                  api_base: str,
+                  data: dict,
+                  headers: dict,
+                  model_response: ModelResponse,
+                  model: str
+    ):
+        with self._client_session.stream(
+            url=f"{api_base}",
+            json=data,
+            headers=headers,
+            method="POST"
+        ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text(), request=self._client_session.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="text-completion-openai", logging_obj=logging_obj)
+            for transformed_chunk in streamwrapper:
+                yield transformed_chunk
+
+    async def async_streaming(self,
+                              logging_obj,
+                              api_base: str,
+                              data: dict,
+                              headers: dict,
+                              model_response: ModelResponse,
+                              model: str):
+        client = httpx.AsyncClient()
+        async with client.stream(
+            url=f"{api_base}",
+            json=data,
+            headers=headers,
+            method="POST"
+        ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text(), request=self._client_session.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="text-completion-openai", logging_obj=logging_obj)
+            async for transformed_chunk in streamwrapper:
+                yield transformed_chunk
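The streaming()/async_streaming() methods added above follow the standard httpx streaming pattern. Here is a self-contained sketch of that pattern for reference; the URL, payload, and headers are placeholders, not values from the commit.

import httpx

def stream_lines(url: str, payload: dict, headers: dict):
    # synchronous counterpart of OpenAIChatCompletion.streaming(): open a streamed POST
    # and yield the response line by line
    with httpx.Client(timeout=600) as client:
        with client.stream(method="POST", url=url, json=payload, headers=headers) as response:
            if response.status_code != 200:
                response.read()  # body must be read before .text is available in stream mode
                raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
            for line in response.iter_lines():
                yield line

async def astream_lines(url: str, payload: dict, headers: dict):
    # async counterpart, matching async_streaming(): aiter_lines() replaces iter_lines()
    async with httpx.AsyncClient(timeout=600) as client:
        async with client.stream(method="POST", url=url, json=payload, headers=headers) as response:
            async for line in response.aiter_lines():
                yield line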
@@ -138,8 +138,10 @@ async def acompletion(*args, **kwargs):

     _, custom_llm_provider, _, _ = get_llm_provider(model=model, api_base=kwargs.get("api_base", None))

-    if (custom_llm_provider == "openai" or custom_llm_provider == "azure" or custom_llm_provider == "custom_openai"): # currently implemented aiohttp calls for just azure and openai, soon all.
+    if (custom_llm_provider == "openai"
+        or custom_llm_provider == "azure"
+        or custom_llm_provider == "custom_openai"
+        or custom_llm_provider == "text-completion-openai"): # currently implemented aiohttp calls for just azure and openai, soon all.
         if kwargs.get("stream", False):
             response = completion(*args, **kwargs)
         else:

@@ -596,15 +598,16 @@ def completion(
                 print_verbose=print_verbose,
                 api_key=api_key,
                 api_base=api_base,
+                acompletion=acompletion,
                 logging_obj=logging,
                 optional_params=optional_params,
                 litellm_params=litellm_params,
                 logger_fn=logger_fn
             )

-            if "stream" in optional_params and optional_params["stream"] == True:
-                response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging)
-                return response
+            # if "stream" in optional_params and optional_params["stream"] == True:
+            #     response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging)
+            #     return response
             response = model_response
         elif (
             "replicate" in model or
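Usage sketch for the extended async path: with "text-completion-openai" now routed through the async branch, an async streaming call looks like the updated tests further down; the model and prompt here are illustrative only.

import asyncio
from litellm import acompletion

async def main():
    response = await acompletion(
        model="gpt-3.5-turbo-instruct",  # resolves to the "text-completion-openai" provider
        messages=[{"role": "user", "content": "write a short poem in one sentence"}],
        stream=True,
    )
    async for chunk in response:
        print(chunk["choices"][0]["delta"].get("content", ""), end="")

asyncio.run(main())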
@@ -28,15 +28,13 @@ def test_async_response():
     user_message = "Hello, how are you?"
     messages = [{"content": user_message, "role": "user"}]
     try:
-        response = await acompletion(model="gpt-3.5-turbo", messages=messages)
-        print(f"response: {response}")
-        response = await acompletion(model="azure/chatgpt-v-2", messages=messages)
+        response = await acompletion(model="gpt-3.5-turbo-instruct", messages=messages)
         print(f"response: {response}")
     except Exception as e:
         pytest.fail(f"An exception occurred: {e}")

     response = asyncio.run(test_get_response())
-    # print(response)
+    print(response)
 # test_async_response()

 def test_get_response_streaming():

@@ -45,8 +43,7 @@ def test_get_response_streaming():
     user_message = "write a short poem in one sentence"
     messages = [{"content": user_message, "role": "user"}]
     try:
-        response = await acompletion(model="azure/chatgpt-v-2", messages=messages, stream=True)
-        # response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True)
+        response = await acompletion(model="gpt-3.5-turbo-instruct", messages=messages, stream=True)
         print(type(response))

         import inspect

@@ -59,18 +56,17 @@ def test_get_response_streaming():
         async for chunk in response:
             token = chunk["choices"][0]["delta"].get("content", "")
             output += token
-        print(f"output: {output}")
         assert output is not None, "output cannot be None."
         assert isinstance(output, str), "output needs to be of type str"
-        assert len(output) > 0, f"Length of output needs to be greater than 0. {output}"
+        assert len(output) > 0, "Length of output needs to be greater than 0."
+        print(f'output: {output}')
     except Exception as e:
         pytest.fail(f"An exception occurred: {e}")
     return response
     asyncio.run(test_async_call())


-test_get_response_streaming()
+# test_get_response_streaming()

 def test_get_response_non_openai_streaming():
     import asyncio
@@ -9,7 +9,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 import pytest
-from openai.error import Timeout
+from openai import Timeout
 import litellm
 from litellm import embedding, completion, completion_cost
 from litellm import RateLimitError

@@ -405,7 +405,6 @@ def test_completion_openai():
         litellm.api_key = os.environ['OPENAI_API_KEY']
         response = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=10, request_timeout=10)
         print("This is the response object\n", response)
-        print("\n\nThis is response ms:", response.response_ms)

         response_str = response["choices"][0]["message"]["content"]

@@ -422,14 +421,15 @@ def test_completion_openai():
         pass
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-# test_completion_openai()
+test_completion_openai()

 def test_completion_text_openai():
     try:
-        litellm.set_verbose = True
+        # litellm.set_verbose = True
         response = completion(model="gpt-3.5-turbo-instruct", messages=messages)
-        print(response)
+        print(response["choices"][0]["message"]["content"])
     except Exception as e:
+        print(e)
         pytest.fail(f"Error occurred: {e}")
 # test_completion_text_openai()
@@ -14,7 +14,7 @@ import litellm
 from litellm import completion_with_retries, completion
 from litellm import (
     AuthenticationError,
-    InvalidRequestError,
+    BadRequestError,
     RateLimitError,
     ServiceUnavailableError,
     OpenAIError,
@@ -1,4 +1,8 @@
-from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError
+try:
+    from openai import AuthenticationError, BadRequestError, RateLimitError, OpenAIError
+except:
+    from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError

 import os
 import sys
 import traceback

@@ -38,23 +42,24 @@ models = ["command-nightly"]
 # Test 1: Context Window Errors
 @pytest.mark.parametrize("model", models)
 def test_context_window(model):
-    sample_text = "Say error 50 times" * 100000
+    sample_text = "Say error 50 times" * 10000
     messages = [{"content": sample_text, "role": "user"}]
-    print(f"model: {model}")
     try:
-        completion(model=model, messages=messages)
+        response = completion(model=model, messages=messages)
+        print(f"response: {response}")
+        print("FAILED!")
         pytest.fail(f"An exception occurred")
-    except ContextWindowExceededError:
-        pass
+    except ContextWindowExceededError as e:
+        print(f"Worked!")
     except RateLimitError:
-        pass
+        print("RateLimited!")
     except Exception as e:
         print(f"{e}")
         pytest.fail(f"An error occcurred - {e}")

 @pytest.mark.parametrize("model", models)
 def test_context_window_with_fallbacks(model):
-    ctx_window_fallback_dict = {"command-nightly": "claude-2"}
+    ctx_window_fallback_dict = {"command-nightly": "claude-2", "gpt-3.5-turbo-instruct": "gpt-3.5-turbo-16k"}
     sample_text = "how does a court case get to the Supreme Court?" * 1000
     messages = [{"content": sample_text, "role": "user"}]

@@ -62,8 +67,8 @@ def test_context_window_with_fallbacks(model):

 # for model in litellm.models_by_provider["bedrock"]:
 #     test_context_window(model=model)
-# test_context_window(model="gpt-3.5-turbo-instruct")
-# test_context_window_with_fallbacks(model="command-nightly")
+# test_context_window(model="gpt-3.5-turbo")
+# test_context_window_with_fallbacks(model="gpt-3.5-turbo")
 # Test 2: InvalidAuth Errors
 @pytest.mark.parametrize("model", models)
 def invalid_auth(model):  # set the model key to an invalid key, depending on the model

@@ -158,14 +163,14 @@ def invalid_auth(model):  # set the model key to an invalid key, depending on the model

 # for model in litellm.models_by_provider["bedrock"]:
 #     invalid_auth(model=model)
-# invalid_auth(model="gpt-3.5-turbo-instruct")
+# invalid_auth(model="gpt-3.5-turbo")

 # Test 3: Invalid Request Error
 @pytest.mark.parametrize("model", models)
 def test_invalid_request_error(model):
     messages = [{"content": "hey, how's it going?", "role": "user"}]

-    with pytest.raises(InvalidRequestError):
+    with pytest.raises(BadRequestError):
         completion(model=model, messages=messages, max_tokens="hello world")

 # test_invalid_request_error(model="gpt-3.5-turbo")

@@ -178,15 +183,16 @@ def test_invalid_request_error(model):
 #         response = completion(model=model, messages=messages)
 #     except RateLimitError:
 #         return True
-#     except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
-#         return True
+#     # except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
+#     #     return True
 #     except Exception as e:
 #         print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
 #         traceback.print_exc()
 #         pass
 #     return False
 # # Repeat each model 500 times
-# extended_models = [model for model in models for _ in range(250)]
+# # extended_models = [model for model in models for _ in range(250)]
+# extended_models = ["gpt-3.5-turbo-instruct" for _ in range(250)]

 # def worker(model):
 #     return test_model_call(model)
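A compact illustration of what the updated test exercises: ContextWindowExceededError now derives from BadRequestError, so either except clause catches an oversized prompt. The model name and prompt size are taken from the test above; everything else is a sketch, not part of the commit.

from litellm import completion
from litellm.exceptions import BadRequestError, ContextWindowExceededError

messages = [{"role": "user", "content": "Say error 50 times" * 10000}]
try:
    completion(model="command-nightly", messages=messages)
except ContextWindowExceededError as e:
    print(f"context window exceeded ({e.status_code})")  # status_code is 400, per the class above
except BadRequestError as e:
    print(f"bad request: {e.message}")  # the parent class also matches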
@@ -11,7 +11,7 @@ sys.path.insert(
 from dotenv import load_dotenv
 load_dotenv()
 import litellm
-from litellm import completion, acompletion, AuthenticationError, InvalidRequestError, RateLimitError
+from litellm import completion, acompletion, AuthenticationError, BadRequestError, RateLimitError, ModelResponse

 litellm.logging = False
 litellm.set_verbose = False

@@ -47,38 +47,17 @@ first_openai_chunk_example = {

 def validate_first_format(chunk):
     # write a test to make sure chunk follows the same format as first_openai_chunk_example
-    assert isinstance(chunk, dict), "Chunk should be a dictionary."
-    assert "id" in chunk, "Chunk should have an 'id'."
+    assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary."
     assert isinstance(chunk['id'], str), "'id' should be a string."
-
-    assert "object" in chunk, "Chunk should have an 'object'."
     assert isinstance(chunk['object'], str), "'object' should be a string."
-
-    assert "created" in chunk, "Chunk should have a 'created'."
     assert isinstance(chunk['created'], int), "'created' should be an integer."
-
-    assert "model" in chunk, "Chunk should have a 'model'."
     assert isinstance(chunk['model'], str), "'model' should be a string."
-
-    assert "choices" in chunk, "Chunk should have 'choices'."
     assert isinstance(chunk['choices'], list), "'choices' should be a list."

     for choice in chunk['choices']:
-        assert isinstance(choice, dict), "Each choice should be a dictionary."
-
-        assert "index" in choice, "Each choice should have 'index'."
         assert isinstance(choice['index'], int), "'index' should be an integer."
-
-        assert "delta" in choice, "Each choice should have 'delta'."
-        assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
-
-        assert "role" in choice['delta'], "'delta' should have a 'role'."
         assert isinstance(choice['delta']['role'], str), "'role' should be a string."
-
-        assert "content" in choice['delta'], "'delta' should have 'content'."
         assert isinstance(choice['delta']['content'], str), "'content' should be a string."
-
-        assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
         assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."

 second_openai_chunk_example = {

@@ -98,35 +77,16 @@ second_openai_chunk_example = {
 }

 def validate_second_format(chunk):
-    assert isinstance(chunk, dict), "Chunk should be a dictionary."
-    assert "id" in chunk, "Chunk should have an 'id'."
+    assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary."
     assert isinstance(chunk['id'], str), "'id' should be a string."
-
-    assert "object" in chunk, "Chunk should have an 'object'."
     assert isinstance(chunk['object'], str), "'object' should be a string."
-
-    assert "created" in chunk, "Chunk should have a 'created'."
     assert isinstance(chunk['created'], int), "'created' should be an integer."
-
-    assert "model" in chunk, "Chunk should have a 'model'."
     assert isinstance(chunk['model'], str), "'model' should be a string."
-
-    assert "choices" in chunk, "Chunk should have 'choices'."
     assert isinstance(chunk['choices'], list), "'choices' should be a list."

     for choice in chunk['choices']:
-        assert isinstance(choice, dict), "Each choice should be a dictionary."
-
-        assert "index" in choice, "Each choice should have 'index'."
         assert isinstance(choice['index'], int), "'index' should be an integer."
-
-        assert "delta" in choice, "Each choice should have 'delta'."
-        assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
-
-        assert "content" in choice['delta'], "'delta' should have 'content'."
         assert isinstance(choice['delta']['content'], str), "'content' should be a string."
-
-        assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
         assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string."

 last_openai_chunk_example = {

@@ -144,32 +104,15 @@ last_openai_chunk_example = {
 }

 def validate_last_format(chunk):
-    assert isinstance(chunk, dict), "Chunk should be a dictionary."
-    assert "id" in chunk, "Chunk should have an 'id'."
+    assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary."
     assert isinstance(chunk['id'], str), "'id' should be a string."
-
-    assert "object" in chunk, "Chunk should have an 'object'."
     assert isinstance(chunk['object'], str), "'object' should be a string."
-
-    assert "created" in chunk, "Chunk should have a 'created'."
     assert isinstance(chunk['created'], int), "'created' should be an integer."
-
-    assert "model" in chunk, "Chunk should have a 'model'."
     assert isinstance(chunk['model'], str), "'model' should be a string."
-
-    assert "choices" in chunk, "Chunk should have 'choices'."
     assert isinstance(chunk['choices'], list), "'choices' should be a list."

     for choice in chunk['choices']:
-        assert isinstance(choice, dict), "Each choice should be a dictionary."
-
-        assert "index" in choice, "Each choice should have 'index'."
         assert isinstance(choice['index'], int), "'index' should be an integer."
-
-        assert "delta" in choice, "Each choice should have 'delta'."
-        assert isinstance(choice['delta'], dict), "'delta' should be a dictionary."
-
-        assert "finish_reason" in choice, "Each choice should have 'finish_reason'."
         assert isinstance(choice['finish_reason'], str), "'finish_reason' should be a string."

 def streaming_format_tests(idx, chunk):

@@ -188,6 +131,7 @@ def streaming_format_tests(idx, chunk):
     if chunk["choices"][0]["finish_reason"]:  # ensure finish reason is only in last chunk
         validate_last_format(chunk=chunk)
         finished = True
+    print(f"chunk choices: {chunk['choices'][0]['delta']['content']}")
     if "content" in chunk["choices"][0]["delta"]:
         extracted_chunk = chunk["choices"][0]["delta"]["content"]
         print(f"extracted chunk: {extracted_chunk}")

@@ -549,7 +493,7 @@ def test_completion_claude_stream_bad_key():
         pytest.fail(f"Error occurred: {e}")


-test_completion_claude_stream_bad_key()
+# test_completion_claude_stream_bad_key()
 # test_completion_replicate_stream()

 # def test_completion_vertexai_stream():

@@ -824,7 +768,7 @@ def ai21_completion_call_bad_key():
         if complete_response.strip() == "":
             raise Exception("Empty response received")
         print(f"completion_response: {complete_response}")
-    except InvalidRequestError as e:
+    except BadRequestError as e:
         pass
     except:
         pytest.fail(f"error occurred: {traceback.format_exc()}")

@@ -885,7 +829,7 @@ def ai21_completion_call_bad_key():
 # test on openai completion call
 def test_openai_chat_completion_call():
     try:
-        litellm.set_verbose = True
+        litellm.set_verbose = False
         response = completion(
             model="gpt-3.5-turbo", messages=messages, stream=True
         )

@@ -904,7 +848,7 @@ def test_openai_chat_completion_call():
         print(f"error occurred: {traceback.format_exc()}")
         pass

-# test_openai_chat_completion_call()
+test_openai_chat_completion_call()

 def test_openai_chat_completion_complete_response_call():
     try:

@@ -928,6 +872,7 @@ def test_openai_text_completion_call():
         start_time = time.time()
         for idx, chunk in enumerate(response):
             chunk, finished = streaming_format_tests(idx, chunk)
+            print(f"chunk: {chunk}")
             complete_response += chunk
             if finished:
                 break

@@ -939,6 +884,8 @@ def test_openai_text_completion_call():
         print(f"error occurred: {traceback.format_exc()}")
         pass

+# test_openai_text_completion_call()
+
 # # test on together ai completion call - starcoder
 def test_together_ai_completion_call_starcoder():
     try:

@@ -992,7 +939,7 @@ def test_together_ai_completion_call_starcoder_bad_key():
         if complete_response == "":
             raise Exception("Empty response received")
         print(f"complete response: {complete_response}")
-    except InvalidRequestError as e:
+    except BadRequestError as e:
         pass
     except:
         print(f"error occurred: {traceback.format_exc()}")
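The trimmed validators above rely on streamed chunks being ModelResponse objects that still support dict-style indexing; a condensed restatement of that assumption (a sketch, not part of the commit):

from litellm.utils import ModelResponse

def validate_chunk(chunk):
    # streamed chunks are ModelResponse objects but still expose dict-style access
    assert isinstance(chunk, ModelResponse)
    assert isinstance(chunk["id"], str)
    assert isinstance(chunk["created"], int)
    assert isinstance(chunk["choices"], list)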
@@ -17,7 +17,10 @@ from concurrent import futures
 from inspect import iscoroutinefunction
 from functools import wraps
 from threading import Thread
-from openai.error import Timeout
+try:
+    from openai import Timeout
+except:
+    from openai.error import Timeout


 def timeout(timeout_duration: float = 0.0, exception_to_raise=Timeout):
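The same try/except import shim appears in both the tests and the timeout helper; stated on its own below. The commit uses a bare except; a narrower ImportError (shown here) is an editorial assumption, not part of the diff.

try:
    # openai >= 1.x exposes the exception types at the package top level
    from openai import Timeout
except ImportError:
    # pre-1.0 fallback kept for older installs
    from openai.error import Timeout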
litellm/utils.py: 516 changes (file diff suppressed because it is too large).